buzzsaw 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.gitignore +14 -0
- data/Gemfile +4 -0
- data/LICENSE.txt +22 -0
- data/README.md +147 -0
- data/Rakefile +2 -0
- data/buzzsaw.gemspec +28 -0
- data/lib/buzzsaw/document.rb +16 -0
- data/lib/buzzsaw/dsl.rb +247 -0
- data/lib/buzzsaw/version.rb +3 -0
- data/lib/buzzsaw.rb +7 -0
- data/spec/dsl_spec.rb +123 -0
- data/spec/fixtures/sample.html +34 -0
- data/spec/spec_helper.rb +12 -0
- metadata +158 -0
checksums.yaml
ADDED
@@ -0,0 +1,7 @@
|
|
1
|
+
---
|
2
|
+
SHA1:
|
3
|
+
metadata.gz: d88ab56bde5c005eaa147b077b71083da8287978
|
4
|
+
data.tar.gz: 710737bfcda47736888b26f45a97d5508b12df3a
|
5
|
+
SHA512:
|
6
|
+
metadata.gz: f59b9ef5ddffa5d885cea106eb1fd6db06037114462e6df045d8ce4e9ee97c1e38303c0bc4df14776565c3f185d9d5f05560c5118c12d59ccbf4b85e64c927b0
|
7
|
+
data.tar.gz: a1ef7c9c9ce90f4195fa5bfcc367f5ffc18eedf18600256976f06e608e9d5c2c36d23d6af63a825fa16bed6d9747e8c8a66af2ed1795745cc7b883bbac99035f
|
data/.gitignore
ADDED
data/Gemfile
ADDED
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2015 Jon Stokes
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,147 @@
|
|
1
|
+
# Buzzsaw
|
2
|
+
|
3
|
+
A DSL that wraps around `Nokogiri` and is used by stretched.io for web scraping.
|
4
|
+
|
5
|
+
## Installation
|
6
|
+
|
7
|
+
Add this line to your application's Gemfile:
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
gem 'buzzsaw'
|
11
|
+
```
|
12
|
+
|
13
|
+
And then execute:
|
14
|
+
|
15
|
+
$ bundle
|
16
|
+
|
17
|
+
Or install it yourself as:
|
18
|
+
|
19
|
+
$ gem install buzzsaw
|
20
|
+
|
21
|
+
## Usage
|
22
|
+
|
23
|
+
This gem is what stretched.io uses for its DSL -- both the JSON-based one and the
|
24
|
+
scripting one. You can use it independently, though.
|
25
|
+
|
26
|
+
## Query DSL
|
27
|
+
|
28
|
+
### find_by_xpath
|
29
|
+
|
30
|
+
Most of the time when I'm scraping the web, I just want to find the first
|
31
|
+
bit of matching text at a matching xpath. That's why `find_by_xpath` is the workhorse
|
32
|
+
of this query DSL.
|
33
|
+
|
34
|
+
This method takes the following arguments:
|
35
|
+
- `xpath`: The xpath query string of the nodes that you want to search for a given pattern. This argument is mandatory.
|
36
|
+
- `match`: A regex that the text of the xpath node should match.
|
37
|
+
- `capture`: A regex that pulls only the matching text out of the matched string and returns it.
|
38
|
+
- `pattern`: If the `pattern` argument is present, then `match = capture = pattern`.
|
39
|
+
- `label`: If this is present, then any positive match will return the string supplied by this argument.
|
40
|
+
|
41
|
+
Here's a look at how `find_by_xpath` works in practice.
|
42
|
+
|
43
|
+
Let's say that you want to extract the price of `product2` from the following bit of HTML in `products.html`:
|
44
|
+
|
45
|
+
```html
|
46
|
+
<div id="product1-details">
|
47
|
+
<ul>
|
48
|
+
<li>Status: In-stock</li>
|
49
|
+
<li>UPC: 00110012232</li>
|
50
|
+
<li>Price: $12.99</li>
|
51
|
+
</ul>
|
52
|
+
</div>
|
53
|
+
|
54
|
+
<div id="product2-details">
|
55
|
+
<ul>
|
56
|
+
<li>Status: In-stock</li>
|
57
|
+
<li>UPC: 00110012232</li>
|
58
|
+
<li>SKU: ITEM-2</li>
|
59
|
+
<li>Price: $12.99</li>
|
60
|
+
</ul>
|
61
|
+
</div>
|
62
|
+
```
|
63
|
+
You might use `find_by_xpath` as follows:
|
64
|
+
```ruby
|
65
|
+
source = File.read("products.html")
|
66
|
+
buzz = Buzzsaw::Document.new(source, format: :html)
|
67
|
+
|
68
|
+
buzz.find_by_xpath(
|
69
|
+
xpath: '//div[@id="product2-details"]//li',
|
70
|
+
pattern: /\$[0-9]+\.[0-9]+/
|
71
|
+
)
|
72
|
+
#=> "$12.99"
|
73
|
+
```
|
74
|
+
If for whatever reason you wanted that entire price node, you could do:
|
75
|
+
|
76
|
+
```ruby
|
77
|
+
buzz.find_by_xpath(
|
78
|
+
xpath: '//div[@id="product2-details"]//li',
|
79
|
+
match: /\$[0-9]+\.[0-9]+/
|
80
|
+
)
|
81
|
+
#=> "Price: $12.99"
|
82
|
+
```
|
83
|
+
Now let's say that you only want "12.99", without the dollar sign. You could do
|
84
|
+
that as follows:
|
85
|
+
```ruby
|
86
|
+
buzz.find_by_xpath(
|
87
|
+
xpath: '//div[@id="product2-details"]//li',
|
88
|
+
match: /\$[0-9]+\.[0-9]+/,
|
89
|
+
capture: /[0-9]+\.[0-9]+/
|
90
|
+
)
|
91
|
+
#=> "12.99"
|
92
|
+
```
|
93
|
+
Sometimes you might want to return a specific bit of text if you find a match on a page.
|
94
|
+
This can be done with the `label` argument.
|
95
|
+
|
96
|
+
For instance, what if we want the `find_by_xpath` function to return the token
|
97
|
+
`in_stock` if we use it to find that the item is in stock? We'd do that as follows:
|
98
|
+
```ruby
|
99
|
+
buzz.find_by_xpath(
|
100
|
+
xpath: '//div[@id="product2-details"]//li',
|
101
|
+
pattern: /Status: In-stock/,
|
102
|
+
label: 'in_stock'
|
103
|
+
)
|
104
|
+
#=> "in_stock"
|
105
|
+
```
|
106
|
+
These examples are contrived, but you get the idea.
|
107
|
+
|
108
|
+
### collect_by_xpath
|
109
|
+
Consider the list of product details above. Let's say that I want
|
110
|
+
to capture and store those details as a human-readable string. If I have a `Nokogiri::HTML::Document` called
|
111
|
+
`doc` with the above HTML in it, then look at the following:
|
112
|
+
|
113
|
+
```ruby
|
114
|
+
doc.xpath("//div[@id='product2-details']//li").text
|
115
|
+
#=> Status: In-stockUPC: 00110012232SKU: ITEM-2Price: $12.99
|
116
|
+
```
|
117
|
+
All of the nodes are crammed together, but it would be nice if I could insert
|
118
|
+
a space in between them. That's one place where `collect_by_xpath` helps.
|
119
|
+
```ruby
|
120
|
+
buzz.collect_by_xpath(
|
121
|
+
xpath: "//div[@id='product2-details']//li",
|
122
|
+
join: ' '
|
123
|
+
)
|
124
|
+
#=> Status: In-stock UPC: 00110012232 SKU: ITEM-2 Price: $12.99
|
125
|
+
```
|
126
|
+
The `collect_by_xpath` function finds all of the matching nodes and concatenates
|
127
|
+
their text, using the character(s) supplied by optional `join` as a delimiter.
|
128
|
+
|
129
|
+
This method also takes the same `match`, `capture`, and `pattern` arguments
|
130
|
+
as `find_by_xpath`, and they do the same thing. You can use the `match` argument to
|
131
|
+
collect only matching nodes, and the `capture` argument to filter the final string.
|
132
|
+
|
133
|
+
Finally, this function also takes the `label` argument.
|
134
|
+
### find_in_table
|
135
|
+
This method is useful for pulling text out of tables, one of the most annoying
|
136
|
+
jobs in web scraping. The `find_in_table` method takes the following arguments:
|
137
|
+
|
138
|
+
- `row`: Either a regex for matching a row, or an integer row index. This argument is mandatory.
|
139
|
+
- `column`: Either a regex for matching a column, or an integer column index.
|
140
|
+
|
141
|
+
## Contributing
|
142
|
+
|
143
|
+
1. Fork it ( https://github.com/jonstokes/Buzzsaw/fork )
|
144
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
145
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
146
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
147
|
+
5. Create a new Pull Request
|
data/Rakefile
ADDED
data/buzzsaw.gemspec
ADDED
@@ -0,0 +1,28 @@
|
|
1
|
+
# coding: utf-8
# Gem specification for buzzsaw. The gem's own lib directory is put on the
# load path first so the version constant can be required before the gem is
# installed.
lib_dir = File.expand_path('../lib', __FILE__)
$LOAD_PATH.unshift(lib_dir) unless $LOAD_PATH.include?(lib_dir)
require 'buzzsaw/version'

Gem::Specification.new do |spec|
  spec.name     = "buzzsaw"
  spec.version  = Buzzsaw::VERSION
  spec.authors  = ["Jon Stokes"]
  spec.email    = ["jon@jonstokes.com"]
  spec.summary  = "A web scraping DSL built on Nokogiri"
  spec.homepage = ""
  spec.license  = "MIT"

  # Package exactly what git tracks; executables and test files are derived
  # from that list by path prefix.
  tracked_files = `git ls-files -z`.split("\x0")
  spec.files         = tracked_files
  spec.executables   = spec.files.grep(%r{^bin/}).map { |path| File.basename(path) }
  spec.test_files    = spec.files.grep(%r{^(test|spec|features)/})
  spec.require_paths = ["lib"]

  # Runtime dependencies (declared in this order).
  {
    "htmlentities"  => "~> 4.3",
    "nokogiri"      => "~> 1.6.6",
    "activesupport" => "~> 4.2",
    "stringex"      => "~> 2.5"
  }.each do |gem_name, requirement|
    spec.add_dependency gem_name, requirement
  end

  # Development-only dependencies.
  {
    "bundler" => "~> 1.7",
    "rake"    => "~> 10.0",
    "rspec"   => "~> 3.3"
  }.each do |gem_name, requirement|
    spec.add_development_dependency gem_name, requirement
  end
end
|
@@ -0,0 +1,16 @@
|
|
1
|
+
module Buzzsaw
  # Holds a parsed document in +doc+ and mixes in the Buzzsaw::DSL query
  # methods, which read that document.
  class Document
    include Buzzsaw::DSL
    attr_reader :doc

    # source - the markup handed to Nokogiri.
    # format - :html parses with Nokogiri::HTML, :xml with Nokogiri::XML;
    #          any other value (including the default nil) falls back to
    #          Nokogiri.parse.
    def initialize(source, format: nil)
      @doc =
        case format
        when :html then Nokogiri::HTML(source)
        when :xml  then Nokogiri::XML(source)
        else            Nokogiri.parse(source)
        end
    end
  end
end
|
data/lib/buzzsaw/dsl.rb
ADDED
@@ -0,0 +1,247 @@
|
|
1
|
+
module Buzzsaw
  # Query DSL for pulling text out of a parsed document.
  #
  # The including object must expose the parsed document as +doc+ (the methods
  # below call +doc.xpath+ and +doc.at_xpath+); +label_by_url+ additionally
  # calls +url+ on the including object.
  #
  # NOTE(review): the code relies on +symbolize_keys!+, +present?+ and +try+
  # (presumably ActiveSupport core extensions), +String#to_ascii+ (presumably
  # stringex), +HTMLEntities+ and +Sanitize+. None of them is required in this
  # file -- confirm they are loaded by the gem's entry point.
  #
  # Every method that takes an +args+ hash symbolizes its keys in place, so the
  # caller's hash is modified.
  module DSL
    ENCODING_EXCEPTION = defined?(Java) ? Java::JavaNioCharset::UnsupportedCharsetException : Encoding::CompatibilityError

    #
    # Main DSL methods
    #

    # Returns text from the first node at args[:xpath] that passes the match.
    #
    # args keys:
    #   :xpath   - xpath query selecting the candidate nodes
    #   :match   - pattern a node's text must match to be considered
    #   :capture - pattern applied to the chosen text; only the matching part
    #              is returned
    #   :pattern - shorthand that sets both :match and :capture
    #   :label   - returned instead of the text whenever a target was found
    #
    # Returns nil when nothing matches.
    def find_by_xpath(args)
      args.symbolize_keys!
      args[:match] = args[:capture] = args[:pattern] if args[:pattern]

      nodes = get_nodes(args)
      target = find_target_text(args, nodes)
      return args[:label] if args[:label] && target.present?
      asciify_target_text(target)
    end

    # Like find_by_xpath, but joins the text of every matching node (using
    # args[:join] as the delimiter) before applying :capture.
    def collect_by_xpath(args)
      args.symbolize_keys!
      args[:match] = args[:capture] = args[:pattern] if args[:pattern]

      nodes = get_nodes(args)
      target = collect_target_text(args, nodes)
      return args[:label] if args[:label] && target.present?
      asciify_target_text(target)
    end

    # Pulls text out of the first table found at args[:xpath].
    #
    # args keys:
    #   :xpath   - xpath query locating the table
    #   :row     - Integer index (interpolated into an xpath position
    #              predicate) or a pattern matched against each row's text
    #   :column  - optional; same forms as :row, applied to the row's cells
    #   :capture - optional pattern applied to the resulting text
    #
    # Returns the row text when no :column is given, otherwise the cell text;
    # nil when the table, row or column cannot be found.
    def find_in_table(args)
      args.symbolize_keys!

      capture = args[:capture]
      match_row, row_index = split_table_selector(args[:row])
      match_column, column_index = split_table_selector(args[:column])

      return unless table = doc.at_xpath(args[:xpath])

      # Rows match first
      return unless row = match_table_element(table, "tr", match_row, row_index)
      unless match_column || column_index
        return capture ? row.text[capture] : row.text
      end

      # Now columns
      return unless col = match_table_element(row, "td", match_column, column_index)

      capture ? col.text[capture] : col.text
    end

    # Returns the content attribute of a <meta> tag in <head>.
    #
    # args keys:
    #   :attribute - name of the meta attribute to test (e.g. 'name')
    #   :value     - value that attribute must have
    #   :pattern   - optional pattern; only the matching part of the content
    #                is returned (:match is accepted as an older alias)
    #   :label     - returned instead of the content when a target was found
    def find_by_meta_tag(args)
      args.symbolize_keys!
      args[:pattern] ||= args[:match] # Backwards compatibility

      nodes = get_nodes_for_meta_attribute(args)
      return unless target = get_content_for_meta_nodes(nodes)
      target = target[args[:pattern]] if args[:pattern]
      return args[:label] if args[:label] && target.present?
      target
    end
    alias_method :label_by_meta_tag, :find_by_meta_tag

    # Returns the whitespace-normalized text of the first element whose
    # itemprop attribute equals +value+. The value is tried upcased,
    # downcased, capitalized and finally exactly as given, so camelCase
    # property names (e.g. "priceCurrency") can match as well.
    def find_by_schema_tag(value)
      nodes = case_variations(value).map do |variant|
        doc.at_xpath("//*[@itemprop=\"#{variant}\"]")
      end.compact
      return if nodes.empty?
      content = nodes.first.text.strip.gsub(/\s+/, " ")
      return unless content.present?
      content
    end

    # Returns args[:label] when the including object's +url+ (converted to a
    # string) matches args[:pattern]; nil otherwise.
    def label_by_url(args)
      args.symbolize_keys!
      return args[:label] if url.to_s[args[:pattern]]
    end

    #
    # Meta tag convenience methods
    #

    # find_by_meta_tag against the 'property' attribute.
    def meta_property(args)
      args.symbolize_keys!
      args.merge!(attribute: 'property')
      find_by_meta_tag(args)
    end

    # find_by_meta_tag against the 'name' attribute.
    def meta_name(args)
      args.symbolize_keys!
      args.merge!(attribute: 'name')
      find_by_meta_tag(args)
    end

    def meta_og(value); meta_property(value: "og:#{value}"); end

    def meta_title;       meta_name(value: 'title');       end
    def meta_keywords;    meta_name(value: 'keywords');    end
    def meta_description; meta_name(value: 'description'); end
    def meta_image;       meta_name(value: 'image');       end
    def meta_price;       meta_name(value: 'price');       end

    def meta_og_title;       meta_og('title');       end
    def meta_og_keywords;    meta_og('keywords');    end
    def meta_og_description; meta_og('description'); end
    def meta_og_image;       meta_og('image');       end

    # Returns args[:label] when the keywords meta tag matches args[:pattern].
    def label_by_meta_keywords(args)
      args.symbolize_keys!
      keywords = meta_keywords # look the tag up once, not once per check
      return args[:label] if keywords && keywords[args[:pattern]]
    end

    #
    # Schema.org convenience methods
    #

    def schema_price;       find_by_schema_tag("price");       end
    def schema_name;        find_by_schema_tag("name");        end
    def schema_description; find_by_schema_tag("description"); end

    # Runs +target+ through each filter in +filter_list+, in order, and
    # returns the stripped result. Filtering stops having any effect once the
    # target is no longer present.
    #
    # A filter is either:
    #   * a String naming a public method of the including object, which is
    #     called with the current target, or
    #   * a hash with one of :accept (keep only the matching part), :reject
    #     (remove the first match), :prefix or :postfix.
    #
    # The +target+ passed in is never modified. An unknown String filter
    # raises ArgumentError.
    def filter_target_text(target, filter_list)
      filter_list.each do |filter|
        next unless target.present?
        target = apply_filter(target, filter)
      end
      target.try(:strip)
    end

    alias_method :filters, :filter_target_text

    #
    # Private
    #
    # (These helpers are documented as internal but remain public methods:
    # filter_target_text dispatches String filters through respond_to?, which
    # only sees public methods.)
    #

    # Finds one +element+ ("tr"/"td") below +table+, either the first whose
    # text matches +match+ or the one(s) at xpath position +index+.
    # Returns nil when nothing is found; an index that selects no element
    # yields nil rather than an empty (but truthy) node set.
    def match_table_element(table, element, match, index)
      if match
        found = table.xpath(".//#{element}").detect { |r| r.text && r.text[match] }
        return found if found
      end
      return unless index

      by_index = table.xpath(".//#{element}[#{index}]")
      by_index unless by_index.empty?
    end

    # Filters +nodes+ by args[:match], takes the first survivor, strips it and
    # applies args[:capture]. Encoding errors are deliberately swallowed and
    # reported as "no result".
    def find_target_text(args, nodes)
      match_target_text!(nodes, args[:match])

      # Select the first match
      result = nodes.first.try(:strip)

      # Filter match with the :capture regex
      capture_target_text(result, args[:capture])
    rescue ENCODING_EXCEPTION
      nil
    end

    # Filters +nodes+ by args[:match], joins the survivors with args[:join]
    # and applies args[:capture]. Encoding errors are deliberately swallowed
    # and reported as "no result".
    def collect_target_text(args, nodes)
      match_target_text!(nodes, args[:match])

      # Reduce the matching nodes
      result = join_target_text(nodes, args[:join])

      # Filter the string with the :capture regex
      capture_target_text(result, args[:capture])
    rescue ENCODING_EXCEPTION
      nil
    end

    # Removes, in place, every entry of +nodes+ that is blank or (when a
    # pattern is given) does not match +pattern+.
    def match_target_text!(nodes, pattern)
      return unless nodes.present?
      nodes.select! do |node|
        pattern ? node[pattern].present? : node.present?
      end
    end

    # Returns the part of +text+ matching +pattern+; without a pattern, the
    # text with every whitespace run collapsed to a single space.
    def capture_target_text(text, pattern)
      return unless text
      pattern ? text[pattern] : text.gsub(/\s+/, " ")
    end

    # Concatenates +nodes+ using +delimiter+ (nil means no delimiter).
    def join_target_text(nodes, delimiter)
      return unless nodes.present?
      nodes.map(&:to_s).join(delimiter.to_s)
    end

    # Cleans +text+ with Sanitize (no elements allowed) and decodes HTML
    # entities in the result.
    def sanitize(text)
      return unless str = Sanitize.clean(text, elements: [])
      HTMLEntities.new.decode(str)
    end

    # Returns the text of every node selected by args[:xpath].
    def get_nodes(args)
      doc.xpath(args[:xpath]).map(&:text).compact
    end

    # Returns the <head> meta nodes whose args[:attribute] equals
    # args[:value] in any tried casing (see case_variations), or nil.
    def get_nodes_for_meta_attribute(args)
      attribute = args[:attribute]
      nodes = case_variations(args[:value]).map do |value|
        doc.at_xpath("//head/meta[@#{attribute}=\"#{value}\"]")
      end.compact
      return if nodes.empty?
      nodes
    end

    # Returns the stripped, space-squeezed content attribute of the first
    # node that has one, or nil.
    def get_content_for_meta_nodes(nodes)
      return unless nodes && nodes.any?
      contents = nodes.map { |node| node.attribute("content") }.compact
      return if contents.empty?
      content = contents.first.value.strip.squeeze(" ")
      return unless content.present?
      content
    end

    # Converts +target+ to ASCII via to_ascii, first replacing any character
    # whose String#dump output contains "u{e2}" with a double quote.
    # NOTE(review): which characters dump that way depends on the Ruby
    # version's String#dump format -- confirm this still fires as intended.
    def asciify_target_text(target)
      return unless target
      newstr = ""
      target.each_char { |chr| newstr << (chr.dump["u{e2}"] ? '"' : chr) }
      newstr.to_ascii
    end

    # Splits a :row/:column selector into [pattern, index]: an Integer is an
    # index, anything else (including nil) is treated as a pattern.
    # Integer is used instead of Fixnum, which newer Rubies no longer define.
    def split_table_selector(selector)
      selector.is_a?(Integer) ? [nil, selector] : [selector, nil]
    end

    # The casings tried when looking up an attribute value: upcased,
    # downcased, capitalized, then exactly as given.
    def case_variations(value)
      [value.upcase, value.downcase, value.capitalize, value].uniq
    end

    # Applies one filter (see filter_target_text) and returns the new target.
    def apply_filter(target, filter)
      if filter.is_a?(String)
        unless respond_to?(filter)
          raise ArgumentError, "Unknown filter: #{filter.inspect}"
        end
        # SECURITY NOTE(review): any public method of the including object
        # can be named here (e.g. "instance_eval"). If filter names can come
        # from untrusted input, restrict them to an explicit whitelist.
        return public_send(filter, target)
      end

      filter.symbolize_keys! if filter.is_a?(Hash)
      if filter[:accept]
        target[filter[:accept]]
      elsif filter[:reject]
        # Work on a copy so the caller's string is left untouched.
        remaining = target.dup
        remaining.slice!(filter[:reject])
        remaining
      elsif filter[:prefix]
        "#{filter[:prefix]}#{target}"
      elsif filter[:postfix]
        "#{target}#{filter[:postfix]}"
      else
        target
      end
    end

    private :split_table_selector, :case_variations, :apply_filter
  end
end
|
data/lib/buzzsaw.rb
ADDED
data/spec/dsl_spec.rb
ADDED
@@ -0,0 +1,123 @@
|
|
1
|
+
require 'spec_helper'

# Exercises the Buzzsaw::DSL query methods through Buzzsaw::Document, using
# the HTML fixture under spec/fixtures.
RSpec.describe Buzzsaw::DSL do

  let(:file_name) { 'sample.html' }
  # The fixture is located via file_name (previously defined but unused, with
  # the path hard-coded), so a context can swap the fixture by overriding it.
  let(:source)    { File.read(File.join('spec', 'fixtures', file_name)) }
  let(:doc)       { Buzzsaw::Document.new(source, format: :html) }

  describe "#find_by_xpath" do
    it "finds the first matching node by xpath" do
      result = doc.find_by_xpath(xpath: "//div[@class='container']//li")
      expect(result).to eq("First Item")
    end

    it "takes a pattern argument" do
      result = doc.find_by_xpath(
        xpath: "//div[@class='container']//li",
        pattern: /second/i
      )
      expect(result).to eq("Second")
    end

    it "takes a match argument" do
      result = doc.find_by_xpath(
        xpath: "//div[@class='container']//li",
        match: /second/i
      )
      expect(result).to eq("Second Item")
    end

    it "takes a capture argument" do
      result = doc.find_by_xpath(
        xpath: "//div[@class='container']//li",
        capture: /first/i
      )
      expect(result).to eq("First")
    end

    it "takes match and capture arguments together" do
      result = doc.find_by_xpath(
        xpath: "//div[@class='container']//li",
        match: /Third/,
        capture: /item/i
      )
      expect(result).to eq("Item")
    end

    it "takes a label argument" do
      result = doc.find_by_xpath(
        xpath: "//div[@class='container']//li",
        match: /Third/,
        label: "Foo"
      )
      expect(result).to eq("Foo")
    end
  end

  describe "#collect_by_xpath" do
    it "collects nodes by xpath" do
      result = doc.collect_by_xpath(xpath: "//div[@class='container']//li")
      expect(result).to eq("First ItemSecond ItemThird ItemFourth Item")
    end

    it "uses a join argument" do
      result = doc.collect_by_xpath(
        xpath: "//div[@class='container']//li",
        join: "|"
      )
      expect(result).to eq("First Item|Second Item|Third Item|Fourth Item")
    end
  end

  describe "#find_in_table" do
    it "takes a capture argument" do
      result = doc.find_in_table(
        xpath: "//table",
        row: 2,
        capture: /row/i
      )
      expect(result.strip.squeeze).to eq("Row")
    end

    context "row argument" do
      it "matches a row by number" do
        result = doc.find_in_table(
          xpath: "//table",
          row: 2
        )
        expect(result.strip.squeeze).to eq("Col 1, Row 2\n Col 2, Row 2")
      end

      it "matches a row by pattern" do
        result = doc.find_in_table(
          xpath: "//table",
          row: /Col 1\, Row 2/
        )
        expect(result.strip.squeeze).to eq("Col 1, Row 2\n Col 2, Row 2")
      end
    end

    context "column argument" do
      it "matches a column by number" do
        result = doc.find_in_table(
          xpath: "//table",
          row: 2,
          column: 2
        )
        expect(result.strip.squeeze).to eq("Col 2, Row 2")
      end

      it "matches a column by pattern" do
        result = doc.find_in_table(
          xpath: "//table",
          row: 2,
          column: /Col 1/
        )
        expect(result.strip.squeeze).to eq("Col 1, Row 2")
      end
    end
  end
end
|
@@ -0,0 +1,34 @@
|
|
1
|
+
<!DOCTYPE html>
|
2
|
+
<html lang="en">
|
3
|
+
<head>
|
4
|
+
<title>Sample HTML document</title>
|
5
|
+
|
6
|
+
</head>
|
7
|
+
|
8
|
+
<body>
|
9
|
+
<div class="container">
|
10
|
+
<ul class="list">
|
11
|
+
<li>First Item</li>
|
12
|
+
<li>Second Item</li>
|
13
|
+
</ul>
|
14
|
+
</div>
|
15
|
+
|
16
|
+
<div class="container">
|
17
|
+
<ul class="list">
|
18
|
+
<li>Third Item</li>
|
19
|
+
<li>Fourth Item</li>
|
20
|
+
</ul>
|
21
|
+
</div>
|
22
|
+
|
23
|
+
<table>
|
24
|
+
<tr>
|
25
|
+
<td>Col 1, Row 1</td>
|
26
|
+
<td>Col 2, Row 1</td>
|
27
|
+
</tr>
|
28
|
+
<tr>
|
29
|
+
<td>Col 1, Row 2</td>
|
30
|
+
<td>Col 2, Row 2</td>
|
31
|
+
</tr>
|
32
|
+
</table>
|
33
|
+
</body>
|
34
|
+
</html>
|
data/spec/spec_helper.rb
ADDED
metadata
ADDED
@@ -0,0 +1,158 @@
|
|
1
|
+
--- !ruby/object:Gem::Specification
|
2
|
+
name: buzzsaw
|
3
|
+
version: !ruby/object:Gem::Version
|
4
|
+
version: 0.0.1
|
5
|
+
platform: ruby
|
6
|
+
authors:
|
7
|
+
- Jon Stokes
|
8
|
+
autorequire:
|
9
|
+
bindir: bin
|
10
|
+
cert_chain: []
|
11
|
+
date: 2015-08-01 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: htmlentities
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - "~>"
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '4.3'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - "~>"
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '4.3'
|
27
|
+
- !ruby/object:Gem::Dependency
|
28
|
+
name: nokogiri
|
29
|
+
requirement: !ruby/object:Gem::Requirement
|
30
|
+
requirements:
|
31
|
+
- - "~>"
|
32
|
+
- !ruby/object:Gem::Version
|
33
|
+
version: 1.6.6
|
34
|
+
type: :runtime
|
35
|
+
prerelease: false
|
36
|
+
version_requirements: !ruby/object:Gem::Requirement
|
37
|
+
requirements:
|
38
|
+
- - "~>"
|
39
|
+
- !ruby/object:Gem::Version
|
40
|
+
version: 1.6.6
|
41
|
+
- !ruby/object:Gem::Dependency
|
42
|
+
name: activesupport
|
43
|
+
requirement: !ruby/object:Gem::Requirement
|
44
|
+
requirements:
|
45
|
+
- - "~>"
|
46
|
+
- !ruby/object:Gem::Version
|
47
|
+
version: '4.2'
|
48
|
+
type: :runtime
|
49
|
+
prerelease: false
|
50
|
+
version_requirements: !ruby/object:Gem::Requirement
|
51
|
+
requirements:
|
52
|
+
- - "~>"
|
53
|
+
- !ruby/object:Gem::Version
|
54
|
+
version: '4.2'
|
55
|
+
- !ruby/object:Gem::Dependency
|
56
|
+
name: stringex
|
57
|
+
requirement: !ruby/object:Gem::Requirement
|
58
|
+
requirements:
|
59
|
+
- - "~>"
|
60
|
+
- !ruby/object:Gem::Version
|
61
|
+
version: '2.5'
|
62
|
+
type: :runtime
|
63
|
+
prerelease: false
|
64
|
+
version_requirements: !ruby/object:Gem::Requirement
|
65
|
+
requirements:
|
66
|
+
- - "~>"
|
67
|
+
- !ruby/object:Gem::Version
|
68
|
+
version: '2.5'
|
69
|
+
- !ruby/object:Gem::Dependency
|
70
|
+
name: bundler
|
71
|
+
requirement: !ruby/object:Gem::Requirement
|
72
|
+
requirements:
|
73
|
+
- - "~>"
|
74
|
+
- !ruby/object:Gem::Version
|
75
|
+
version: '1.7'
|
76
|
+
type: :development
|
77
|
+
prerelease: false
|
78
|
+
version_requirements: !ruby/object:Gem::Requirement
|
79
|
+
requirements:
|
80
|
+
- - "~>"
|
81
|
+
- !ruby/object:Gem::Version
|
82
|
+
version: '1.7'
|
83
|
+
- !ruby/object:Gem::Dependency
|
84
|
+
name: rake
|
85
|
+
requirement: !ruby/object:Gem::Requirement
|
86
|
+
requirements:
|
87
|
+
- - "~>"
|
88
|
+
- !ruby/object:Gem::Version
|
89
|
+
version: '10.0'
|
90
|
+
type: :development
|
91
|
+
prerelease: false
|
92
|
+
version_requirements: !ruby/object:Gem::Requirement
|
93
|
+
requirements:
|
94
|
+
- - "~>"
|
95
|
+
- !ruby/object:Gem::Version
|
96
|
+
version: '10.0'
|
97
|
+
- !ruby/object:Gem::Dependency
|
98
|
+
name: rspec
|
99
|
+
requirement: !ruby/object:Gem::Requirement
|
100
|
+
requirements:
|
101
|
+
- - "~>"
|
102
|
+
- !ruby/object:Gem::Version
|
103
|
+
version: '3.3'
|
104
|
+
type: :development
|
105
|
+
prerelease: false
|
106
|
+
version_requirements: !ruby/object:Gem::Requirement
|
107
|
+
requirements:
|
108
|
+
- - "~>"
|
109
|
+
- !ruby/object:Gem::Version
|
110
|
+
version: '3.3'
|
111
|
+
description:
|
112
|
+
email:
|
113
|
+
- jon@jonstokes.com
|
114
|
+
executables: []
|
115
|
+
extensions: []
|
116
|
+
extra_rdoc_files: []
|
117
|
+
files:
|
118
|
+
- ".gitignore"
|
119
|
+
- Gemfile
|
120
|
+
- LICENSE.txt
|
121
|
+
- README.md
|
122
|
+
- Rakefile
|
123
|
+
- buzzsaw.gemspec
|
124
|
+
- lib/buzzsaw.rb
|
125
|
+
- lib/buzzsaw/document.rb
|
126
|
+
- lib/buzzsaw/dsl.rb
|
127
|
+
- lib/buzzsaw/version.rb
|
128
|
+
- spec/dsl_spec.rb
|
129
|
+
- spec/fixtures/sample.html
|
130
|
+
- spec/spec_helper.rb
|
131
|
+
homepage: ''
|
132
|
+
licenses:
|
133
|
+
- MIT
|
134
|
+
metadata: {}
|
135
|
+
post_install_message:
|
136
|
+
rdoc_options: []
|
137
|
+
require_paths:
|
138
|
+
- lib
|
139
|
+
required_ruby_version: !ruby/object:Gem::Requirement
|
140
|
+
requirements:
|
141
|
+
- - ">="
|
142
|
+
- !ruby/object:Gem::Version
|
143
|
+
version: '0'
|
144
|
+
required_rubygems_version: !ruby/object:Gem::Requirement
|
145
|
+
requirements:
|
146
|
+
- - ">="
|
147
|
+
- !ruby/object:Gem::Version
|
148
|
+
version: '0'
|
149
|
+
requirements: []
|
150
|
+
rubyforge_project:
|
151
|
+
rubygems_version: 2.4.6
|
152
|
+
signing_key:
|
153
|
+
specification_version: 4
|
154
|
+
summary: A web scraping DSL built on Nokogiri
|
155
|
+
test_files:
|
156
|
+
- spec/dsl_spec.rb
|
157
|
+
- spec/fixtures/sample.html
|
158
|
+
- spec/spec_helper.rb
|