yasf 0.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +19 -0
- data/.rvmrc +52 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +22 -0
- data/README.md +45 -0
- data/Rakefile +7 -0
- data/lib/yasf/scraper.rb +144 -0
- data/lib/yasf/version.rb +3 -0
- data/lib/yasf.rb +15 -0
- data/spec/fixtures/.gitkeep +0 -0
- data/spec/fixtures/advanced_example_response +41 -0
- data/spec/fixtures/basic_example_response +10 -0
- data/spec/fixtures/medium_example_response +13 -0
- data/spec/fixtures/thepiratebay_response.html +510 -0
- data/spec/lib/yasf/.gitkeep +0 -0
- data/spec/lib/yasf/scraper_spec.rb +18 -0
- data/spec/lib/yasf_spec.rb +100 -0
- data/spec/spec_helper.rb +24 -0
- data/yasf.gemspec +25 -0
- metadata +118 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# This is an RVM Project .rvmrc file, used to automatically load the ruby
|
4
|
+
# development environment upon cd'ing into the directory
|
5
|
+
|
6
|
+
# First we specify our desired <ruby>[@<gemset>]; the @gemset name is optional.
|
7
|
+
# Only full ruby name is supported here, for short names use:
|
8
|
+
# echo "rvm use 1.9.3" > .rvmrc
|
9
|
+
environment_id="ruby-1.9.3@yasf"
|
10
|
+
|
11
|
+
# Uncomment the following lines if you want to verify rvm version per project
|
12
|
+
# rvmrc_rvm_version="1.15.9 ()" # 1.10.1 seems like a safe start
|
13
|
+
# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
|
14
|
+
# echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
|
15
|
+
# return 1
|
16
|
+
# }
|
17
|
+
|
18
|
+
# First we attempt to load the desired environment directly from the environment
|
19
|
+
# file. This is very fast and efficient compared to running through the entire
|
20
|
+
# CLI and selector. If you want feedback on which environment was used then
|
21
|
+
# insert the word 'use' after --create as this triggers verbose mode.
|
22
|
+
if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
|
23
|
+
&& -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
|
24
|
+
then
|
25
|
+
\. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
|
26
|
+
[[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
|
27
|
+
\. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
|
28
|
+
if [[ $- == *i* ]] # check for interactive shells
|
29
|
+
then echo "Using: $(tput setaf 2)$GEM_HOME$(tput sgr0)" # show the user the ruby and gemset they are using in green
|
30
|
+
else echo "Using: $GEM_HOME" # don't use colors in non-interactive shells
|
31
|
+
fi
|
32
|
+
else
|
33
|
+
# If the environment file has not yet been created, use the RVM CLI to select.
|
34
|
+
rvm --create use "$environment_id" || {
|
35
|
+
echo "Failed to create RVM environment '${environment_id}'."
|
36
|
+
return 1
|
37
|
+
}
|
38
|
+
fi
|
39
|
+
|
40
|
+
# If you use bundler, this might be useful to you:
|
41
|
+
# if [[ -s Gemfile ]] && {
|
42
|
+
# ! builtin command -v bundle >/dev/null ||
|
43
|
+
# builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
|
44
|
+
# }
|
45
|
+
# then
|
46
|
+
# printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
|
47
|
+
# gem install bundler
|
48
|
+
# fi
|
49
|
+
# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
|
50
|
+
# then
|
51
|
+
# bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
|
52
|
+
# fi
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
yasf (0.0.1)
|
5
|
+
nokogiri (= 1.5.5)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: http://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.1.3)
|
11
|
+
fakeweb (1.3.0)
|
12
|
+
nokogiri (1.5.5)
|
13
|
+
rake (0.9.2.2)
|
14
|
+
rspec (2.11.0)
|
15
|
+
rspec-core (~> 2.11.0)
|
16
|
+
rspec-expectations (~> 2.11.0)
|
17
|
+
rspec-mocks (~> 2.11.0)
|
18
|
+
rspec-core (2.11.1)
|
19
|
+
rspec-expectations (2.11.3)
|
20
|
+
diff-lcs (~> 1.1.3)
|
21
|
+
rspec-mocks (2.11.3)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
ruby
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
fakeweb
|
28
|
+
rake
|
29
|
+
rspec
|
30
|
+
yasf!
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Algonauti
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
yasf
|
2
|
+
====
|
3
|
+
|
4
|
+
Yet Another Scraper Framework
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'yasf'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install yasf
|
19
|
+
|
20
|
+
## Usage
|
21
|
+
row_scraper = Yasf.define do
|
22
|
+
scrape "h1.title", :title => :text
|
23
|
+
scrape "a.brand", :brand => :text, :brand_link => :href
|
24
|
+
|
25
|
+
result :title, :brand, :brand_link
|
26
|
+
end
|
27
|
+
|
28
|
+
scraper = Yasf.define do
|
29
|
+
scrape "table.companies tr.company", :'rows[]' => row_scraper
|
30
|
+
result :rows
|
31
|
+
end
|
32
|
+
|
33
|
+
### And using the scraper:
|
34
|
+
url = "http://local.domain"
|
35
|
+
results = scraper.extract_from(url)
|
36
|
+
result = results.first
|
37
|
+
puts result.title
|
38
|
+
|
39
|
+
## Contributing
|
40
|
+
|
41
|
+
1. Fork it
|
42
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
43
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
44
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
45
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/lib/yasf/scraper.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
module Yasf
|
4
|
+
class Scraper
|
5
|
+
|
6
|
+
class << self
|
7
|
+
|
8
|
+
def extract_from(source, options = nil)
|
9
|
+
self.new(source, options).extract
|
10
|
+
end
|
11
|
+
|
12
|
+
# Defines a processing rule.
|
13
|
+
def scrape(*args)
|
14
|
+
name = args.shift if args.first.is_a?(Symbol)
|
15
|
+
if args.last.is_a?(Hash)
|
16
|
+
extractor = extractor(args.pop)
|
17
|
+
end
|
18
|
+
raise ArgumentError, "Missing extractor: the last argument tells us what to extract" unless extractor
|
19
|
+
raise ArgumentError, "Missing selector: the first argument tells us what to select" if args.empty?
|
20
|
+
define_method :__extractor, extractor
|
21
|
+
method = instance_method(:__extractor)
|
22
|
+
remove_method :__extractor
|
23
|
+
rules << [args.pop, method, name]
|
24
|
+
end
|
25
|
+
|
26
|
+
# Returns an array of scraper rules
|
27
|
+
def rules()
|
28
|
+
@rules ||= []
|
29
|
+
end
|
30
|
+
|
31
|
+
def result(*symbols)
|
32
|
+
raise ArgumentError, "one symbol to return the value of this accessor" if symbols.empty?
|
33
|
+
symbols = symbols.map {|s| s.to_sym}
|
34
|
+
if symbols.size == 1
|
35
|
+
define_method :result do
|
36
|
+
return self.send(symbols[0])
|
37
|
+
end
|
38
|
+
else
|
39
|
+
struct = Struct.new(*symbols)
|
40
|
+
define_method :result do
|
41
|
+
return struct.new(*symbols.collect {|s| self.send(s) })
|
42
|
+
end
|
43
|
+
end
|
44
|
+
end
|
45
|
+
|
46
|
+
# Creates an extractor that will extract values from the selected
|
47
|
+
# element and place them in instance variables of the scraper.
|
48
|
+
def extractor(map)
|
49
|
+
extracts = []
|
50
|
+
map.each_pair do |target, source|
|
51
|
+
source = extract_value_from(source)
|
52
|
+
target = extract_value_to(target)
|
53
|
+
define_method :__extractor do |element|
|
54
|
+
value = source.call(element)
|
55
|
+
target.call(self, value) unless value.nil?
|
56
|
+
end
|
57
|
+
extracts << instance_method(:__extractor)
|
58
|
+
remove_method :__extractor
|
59
|
+
end
|
60
|
+
lambda do |element|
|
61
|
+
extracts.each do |extract|
|
62
|
+
extract.bind(self).call(element)
|
63
|
+
end
|
64
|
+
true
|
65
|
+
end
|
66
|
+
end
|
67
|
+
|
68
|
+
private
|
69
|
+
|
70
|
+
# Returns a Proc that will extract a value from an element.
|
71
|
+
def extract_value_from(source)
|
72
|
+
case source
|
73
|
+
when Class
|
74
|
+
unless source.ancestors.include?(Yasf::Scraper)
|
75
|
+
raise ArgumentError, "Class must extends Yasf::Scraper"
|
76
|
+
end
|
77
|
+
return lambda { |element| source.new(element).extract }
|
78
|
+
when Symbol
|
79
|
+
return lambda do |element|
|
80
|
+
if element.respond_to?(source)
|
81
|
+
element.send(source)
|
82
|
+
elsif element.respond_to?("[]", source)
|
83
|
+
element.send("[]", source)
|
84
|
+
else
|
85
|
+
raise ArgumentError, "Method not found"
|
86
|
+
end
|
87
|
+
end
|
88
|
+
end
|
89
|
+
end
|
90
|
+
|
91
|
+
# Returns a Proc that will set the extracted value in the object.
|
92
|
+
def extract_value_to(target)
|
93
|
+
method_name = target.to_s.tr_s("[]", "")
|
94
|
+
|
95
|
+
attr_accessor method_name
|
96
|
+
|
97
|
+
if target.to_s.end_with? "[]"
|
98
|
+
reader = "#{method_name}".to_sym
|
99
|
+
writer = "#{method_name}=".to_sym
|
100
|
+
return lambda do |object, value|
|
101
|
+
array = object.send(reader)
|
102
|
+
object.send(writer, array = []) unless array
|
103
|
+
array << value
|
104
|
+
end
|
105
|
+
else
|
106
|
+
reader = "#{method_name}=".to_sym
|
107
|
+
return lambda { |object, value| object.send(reader, value) }
|
108
|
+
end
|
109
|
+
|
110
|
+
end
|
111
|
+
|
112
|
+
end # end self
|
113
|
+
|
114
|
+
# The argument +source+ is a String (url format), or Nokogiri::XML::Element
|
115
|
+
def initialize(source, options = nil)
|
116
|
+
@options = options || {}
|
117
|
+
case source
|
118
|
+
when String
|
119
|
+
@document = Nokogiri::HTML(open(source))
|
120
|
+
when Nokogiri::XML::Element
|
121
|
+
@document = source
|
122
|
+
else
|
123
|
+
raise ArgumentError, "source not recognized"
|
124
|
+
end
|
125
|
+
end
|
126
|
+
|
127
|
+
# Returns the document being processed.
|
128
|
+
def document
|
129
|
+
@document
|
130
|
+
end
|
131
|
+
|
132
|
+
# Scrapes the document and returns the result.
|
133
|
+
def extract
|
134
|
+
rules = self.class.rules.clone
|
135
|
+
rules.delete_if do |selector, extractor, rule_name|
|
136
|
+
document.search(selector).each do |element|
|
137
|
+
extractor.bind(self).call(element)
|
138
|
+
end
|
139
|
+
end
|
140
|
+
return result
|
141
|
+
end
|
142
|
+
|
143
|
+
end
|
144
|
+
end
|
data/lib/yasf/version.rb
ADDED
data/lib/yasf.rb
ADDED
File without changes
|
@@ -0,0 +1,41 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>FakePage</title>
|
5
|
+
</head>
|
6
|
+
<body>
|
7
|
+
<table>
|
8
|
+
<tr class="tr_with_title">
|
9
|
+
<td>
|
10
|
+
<h1 class="title_under_table">Title 1</h1>
|
11
|
+
<a href="http://linkto.title.one" class="title_under_table">Link Title 1</a>
|
12
|
+
</td>
|
13
|
+
</tr>
|
14
|
+
<tr class="tr_with_title">
|
15
|
+
<td>
|
16
|
+
<h1 class="title_under_table">Title 2</h1>
|
17
|
+
<a href="http://linkto.title.one" class="title_under_table">Link Title 2</a>
|
18
|
+
</td>
|
19
|
+
</tr>
|
20
|
+
<tr class="tr_with_title">
|
21
|
+
<td>
|
22
|
+
<h1 class="title_under_table">Title 3</h1>
|
23
|
+
<a href="http://linkto.title.one" class="title_under_table">Link Title 3</a>
|
24
|
+
</td>
|
25
|
+
</tr>
|
26
|
+
<tr class="tr_with_title">
|
27
|
+
<td>
|
28
|
+
<h1 class="title_under_table">Title 4</h1>
|
29
|
+
<a href="http://linkto.title.one" class="title_under_table">Link Title 4</a>
|
30
|
+
</td>
|
31
|
+
</tr>
|
32
|
+
<tr class="tr_with_title">
|
33
|
+
<td>
|
34
|
+
<h1 class="title_under_table">Title 5</h1>
|
35
|
+
<a href="http://linkto.title.one" class="title_under_table">Link Title 5</a>
|
36
|
+
</td>
|
37
|
+
</tr>
|
38
|
+
</table>
|
39
|
+
</body>
|
40
|
+
</html>
|
41
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>FakePage</title>
|
5
|
+
</head>
|
6
|
+
<body>
|
7
|
+
<h1 class="title">Title 1</h1>
|
8
|
+
<h1 class="title">Title 2</h1>
|
9
|
+
<h1 class="title">Title 3</h1>
|
10
|
+
<h1 class="title">Title 4</h1>
|
11
|
+
</body>
|
12
|
+
</html>
|
13
|
+
|