yasf 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +19 -0
- data/.rvmrc +52 -0
- data/Gemfile +4 -0
- data/Gemfile.lock +30 -0
- data/LICENSE.txt +22 -0
- data/README.md +45 -0
- data/Rakefile +7 -0
- data/lib/yasf/scraper.rb +144 -0
- data/lib/yasf/version.rb +3 -0
- data/lib/yasf.rb +15 -0
- data/spec/fixtures/.gitkeep +0 -0
- data/spec/fixtures/advanced_example_response +41 -0
- data/spec/fixtures/basic_example_response +10 -0
- data/spec/fixtures/medium_example_response +13 -0
- data/spec/fixtures/thepiratebay_response.html +510 -0
- data/spec/lib/yasf/.gitkeep +0 -0
- data/spec/lib/yasf/scraper_spec.rb +18 -0
- data/spec/lib/yasf_spec.rb +100 -0
- data/spec/spec_helper.rb +24 -0
- data/yasf.gemspec +25 -0
- metadata +118 -0
data/.gitignore
ADDED
data/.rvmrc
ADDED
@@ -0,0 +1,52 @@
|
|
1
|
+
#!/usr/bin/env bash
|
2
|
+
|
3
|
+
# This is an RVM Project .rvmrc file, used to automatically load the ruby
|
4
|
+
# development environment upon cd'ing into the directory
|
5
|
+
|
6
|
+
# First we specify our desired <ruby>[@<gemset>]; the @gemset name is optional.
|
7
|
+
# Only full ruby name is supported here, for short names use:
|
8
|
+
# echo "rvm use 1.9.3" > .rvmrc
|
9
|
+
# Ruby interpreter and gemset this project is pinned to.
environment_id="ruby-1.9.3@yasf"

# Uncomment the following lines if you want to verify rvm version per project
# rvmrc_rvm_version="1.15.9 ()" # 1.10.1 seems like a safe start
# eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
#   echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
#   return 1
# }

# Fast path: source the pre-generated environment file directly instead of
# running through the entire rvm CLI and selector. If you want feedback on
# which environment was used then insert the word 'use' after --create as this
# triggers verbose mode.
#
# The rvm directory is re-expanded on every use (rather than cached in a
# variable) so that nothing new leaks into the sourcing shell and so that a
# value set by the environment file itself is honoured by the later lookups.
if [[ -d "${rvm_path:-$HOME/.rvm}/environments" ]] \
  && [[ -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]; then
  \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"

  # Run the optional after_use hook; a missing or failing hook is ignored.
  if [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]]; then
    \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
  fi

  case $- in
    *i*)
      # Interactive shell: show the ruby and gemset in use, in green.
      echo "Using: $(tput setaf 2)$GEM_HOME$(tput sgr0)"
      ;;
    *)
      # Non-interactive shell: don't use colors.
      echo "Using: $GEM_HOME"
      ;;
  esac
else
  # If the environment file has not yet been created, use the RVM CLI to select.
  if ! rvm --create use "$environment_id"; then
    echo "Failed to create RVM environment '${environment_id}'."
    return 1
  fi
fi
|
39
|
+
|
40
|
+
# If you use bundler, this might be useful to you:
|
41
|
+
# if [[ -s Gemfile ]] && {
|
42
|
+
# ! builtin command -v bundle >/dev/null ||
|
43
|
+
# builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
|
44
|
+
# }
|
45
|
+
# then
|
46
|
+
# printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
|
47
|
+
# gem install bundler
|
48
|
+
# fi
|
49
|
+
# if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
|
50
|
+
# then
|
51
|
+
# bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
|
52
|
+
# fi
|
data/Gemfile
ADDED
data/Gemfile.lock
ADDED
@@ -0,0 +1,30 @@
|
|
1
|
+
PATH
|
2
|
+
remote: .
|
3
|
+
specs:
|
4
|
+
yasf (0.0.1)
|
5
|
+
nokogiri (= 1.5.5)
|
6
|
+
|
7
|
+
GEM
|
8
|
+
remote: http://rubygems.org/
|
9
|
+
specs:
|
10
|
+
diff-lcs (1.1.3)
|
11
|
+
fakeweb (1.3.0)
|
12
|
+
nokogiri (1.5.5)
|
13
|
+
rake (0.9.2.2)
|
14
|
+
rspec (2.11.0)
|
15
|
+
rspec-core (~> 2.11.0)
|
16
|
+
rspec-expectations (~> 2.11.0)
|
17
|
+
rspec-mocks (~> 2.11.0)
|
18
|
+
rspec-core (2.11.1)
|
19
|
+
rspec-expectations (2.11.3)
|
20
|
+
diff-lcs (~> 1.1.3)
|
21
|
+
rspec-mocks (2.11.3)
|
22
|
+
|
23
|
+
PLATFORMS
|
24
|
+
ruby
|
25
|
+
|
26
|
+
DEPENDENCIES
|
27
|
+
fakeweb
|
28
|
+
rake
|
29
|
+
rspec
|
30
|
+
yasf!
|
data/LICENSE.txt
ADDED
@@ -0,0 +1,22 @@
|
|
1
|
+
Copyright (c) 2012 Algonauti
|
2
|
+
|
3
|
+
MIT License
|
4
|
+
|
5
|
+
Permission is hereby granted, free of charge, to any person obtaining
|
6
|
+
a copy of this software and associated documentation files (the
|
7
|
+
"Software"), to deal in the Software without restriction, including
|
8
|
+
without limitation the rights to use, copy, modify, merge, publish,
|
9
|
+
distribute, sublicense, and/or sell copies of the Software, and to
|
10
|
+
permit persons to whom the Software is furnished to do so, subject to
|
11
|
+
the following conditions:
|
12
|
+
|
13
|
+
The above copyright notice and this permission notice shall be
|
14
|
+
included in all copies or substantial portions of the Software.
|
15
|
+
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
18
|
+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
19
|
+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
20
|
+
LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
21
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
22
|
+
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
data/README.md
ADDED
@@ -0,0 +1,45 @@
|
|
1
|
+
yasf
|
2
|
+
====
|
3
|
+
|
4
|
+
Yet Another Scraper Framework
|
5
|
+
|
6
|
+
## Installation
|
7
|
+
|
8
|
+
Add this line to your application's Gemfile:
|
9
|
+
|
10
|
+
gem 'yasf'
|
11
|
+
|
12
|
+
And then execute:
|
13
|
+
|
14
|
+
$ bundle
|
15
|
+
|
16
|
+
Or install it yourself as:
|
17
|
+
|
18
|
+
$ gem install yasf
|
19
|
+
|
20
|
+
## Usage
|
21
|
+
row_scraper = Yasf.define do
|
22
|
+
scrape "h1.title", :title => :text
|
23
|
+
scrape "a.brand", :brand => :text, :brand_link => :href
|
24
|
+
|
25
|
+
result :title, :brand, :brand_link
|
26
|
+
end
|
27
|
+
|
28
|
+
scraper = Yasf.define do
|
29
|
+
scrape "table.companies tr.company", :'rows[]' => row_scraper
|
30
|
+
result :rows
|
31
|
+
end
|
32
|
+
|
33
|
+
### And using the scraper:
|
34
|
+
url = "http://local.domain"
|
35
|
+
results = scraper.extract_from(url)
|
36
|
+
result = results.first
|
37
|
+
puts result.title
|
38
|
+
|
39
|
+
## Contributing
|
40
|
+
|
41
|
+
1. Fork it
|
42
|
+
2. Create your feature branch (`git checkout -b my-new-feature`)
|
43
|
+
3. Commit your changes (`git commit -am 'Add some feature'`)
|
44
|
+
4. Push to the branch (`git push origin my-new-feature`)
|
45
|
+
5. Create new Pull Request
|
data/Rakefile
ADDED
data/lib/yasf/scraper.rb
ADDED
@@ -0,0 +1,144 @@
|
|
1
|
+
require "nokogiri"
|
2
|
+
|
3
|
+
module Yasf
  # Base class for declarative scrapers.
  #
  # A subclass declares rules with +scrape+ (a selector plus a Hash mapping
  # target accessors to value sources) and selects what +extract+ returns
  # with +result+.
  class Scraper

    class << self

      # Builds a scraper for +source+ and returns its extracted result.
      # +source+ and +options+ are passed straight to +new+.
      def extract_from(source, options = nil)
        new(source, options).extract
      end

      # Defines a processing rule.
      #
      # Arguments, in order: an optional rule name (Symbol), the selector,
      # and a Hash describing what to extract (see +extractor+).
      #
      # Raises ArgumentError when the extractor Hash or the selector is
      # missing.
      def scrape(*args)
        name = args.shift if args.first.is_a?(Symbol)
        extract_proc = args.last.is_a?(Hash) ? extractor(args.pop) : nil
        raise ArgumentError, "Missing extractor: the last argument tells us what to extract" unless extract_proc
        raise ArgumentError, "Missing selector: the first argument tells us what to select" if args.empty?

        # Convert the lambda into an UnboundMethod so that +extract+ can bind
        # it to a scraper instance; the temporary method is removed at once.
        define_method :__extractor, extract_proc
        method = instance_method(:__extractor)
        remove_method :__extractor

        rules << [args.pop, method, name]
      end

      # Returns the array of scraper rules declared on this class. Each rule
      # is a [selector, extractor method, rule name] triple.
      def rules
        @rules ||= []
      end

      # Declares what +extract+ returns. With a single symbol the value of
      # that accessor is returned; with several, a Struct holding the value
      # of each accessor.
      #
      # Raises ArgumentError when called without symbols.
      def result(*symbols)
        raise ArgumentError, "one symbol to return the value of this accessor" if symbols.empty?
        symbols = symbols.map { |s| s.to_sym }
        if symbols.size == 1
          single = symbols[0]
          define_method(:result) { send(single) }
        else
          struct = Struct.new(*symbols)
          define_method(:result) do
            struct.new(*symbols.collect { |s| send(s) })
          end
        end
      end

      # Creates an extractor that will extract values from the selected
      # element and place them in instance variables of the scraper.
      #
      # +map+ maps each target (accessor name, with a "[]" suffix to collect
      # values into an array) to a source (a Symbol or a Yasf::Scraper
      # subclass). The returned lambda must run with +self+ set to a scraper
      # instance, which is what +scrape+ arranges through +define_method+.
      def extractor(map)
        extracts = map.map do |target, source|
          read_value  = extract_value_from(source)
          store_value = extract_value_to(target)
          define_method :__extractor do |element|
            value = read_value.call(element)
            store_value.call(self, value) unless value.nil?
          end
          unbound = instance_method(:__extractor)
          remove_method :__extractor
          unbound
        end
        lambda do |element|
          extracts.each { |extract| extract.bind(self).call(element) }
          true
        end
      end

      private

      # Returns a Proc that will extract a value from an element.
      #
      # A Yasf::Scraper subclass runs as a nested scraper on the element. A
      # Symbol is first tried as a method of the element and then as a key
      # for the element's public +[]+ method.
      #
      # Raises ArgumentError for any other kind of source, so that a bad
      # rule fails when it is declared rather than as a NoMethodError on nil
      # during extraction.
      def extract_value_from(source)
        case source
        when Class
          unless source.ancestors.include?(Yasf::Scraper)
            raise ArgumentError, "Class must extends Yasf::Scraper"
          end
          lambda { |element| source.new(element).extract }
        when Symbol
          lambda do |element|
            if element.respond_to?(source)
              element.send(source)
            elsif element.respond_to?(:[])
              # The second argument of respond_to? is the include-private
              # flag, not a method argument, so only the method name is
              # passed here.
              element[source]
            else
              raise ArgumentError, "Method not found"
            end
          end
        else
          raise ArgumentError,
                "Unsupported value source #{source.inspect}: expected a Symbol or a Yasf::Scraper subclass"
        end
      end

      # Returns a Proc that will set the extract value in the object.
      #
      # Defines an accessor named after +target+ (brackets removed). When
      # +target+ ends with "[]" the Proc appends each value to an array kept
      # in that accessor; otherwise it assigns the value.
      def extract_value_to(target)
        target_name = target.to_s
        method_name = target_name.delete("[]")

        attr_accessor method_name

        writer = "#{method_name}=".to_sym
        if target_name.end_with?("[]")
          reader = method_name.to_sym
          lambda do |object, value|
            array = object.send(reader)
            # Create the array lazily on the first extracted value.
            object.send(writer, array = []) unless array
            array << value
          end
        else
          lambda { |object, value| object.send(writer, value) }
        end
      end

    end # end self

    # Returns the document being processed.
    attr_reader :document

    # The argument +source+ is a String (url format) or an already parsed
    # Nokogiri::XML::Node (an element or a whole document).
    #
    # Raises ArgumentError for any other kind of source.
    def initialize(source, options = nil)
      @options = options || {}
      @document =
        case source
        when String
          # NOTE(review): Kernel#open only fetches URLs when open-uri has been
          # loaded (it is not required in this file -- presumably done
          # elsewhere; verify), and it runs a command when the string starts
          # with "|". Do not pass untrusted input as +source+.
          Nokogiri::HTML(open(source))
        when Nokogiri::XML::Node
          source
        else
          raise ArgumentError, "source not recognized"
        end
    end

    # Scrapes the document and returns the result: every rule declared on
    # the class is applied, in order, to each element matching its selector.
    def extract
      self.class.rules.each do |selector, extractor, _rule_name|
        document.search(selector).each do |element|
          extractor.bind(self).call(element)
        end
      end
      result
    end

  end
end
|
data/lib/yasf/version.rb
ADDED
data/lib/yasf.rb
ADDED
File without changes
|
@@ -0,0 +1,41 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>FakePage</title>
|
5
|
+
</head>
|
6
|
+
<body>
|
7
|
+
<table>
|
8
|
+
<tr class="tr_with_title">
|
9
|
+
<td>
|
10
|
+
<h1 class="title_under_table">Title 1</h1>
|
11
|
+
<a href="http://linkto.title.one" class="title_under_table">Link Title 1</a>
|
12
|
+
</td>
|
13
|
+
</tr>
|
14
|
+
<tr class="tr_with_title">
|
15
|
+
<td>
|
16
|
+
<h1 class="title_under_table">Title 2</h1>
|
17
|
+
<a href="http://linkto.title.one" class="title_under_table">Link Title 2</a>
|
18
|
+
</td>
|
19
|
+
</tr>
|
20
|
+
<tr class="tr_with_title">
|
21
|
+
<td>
|
22
|
+
<h1 class="title_under_table">Title 3</h1>
|
23
|
+
<a href="http://linkto.title.one" class="title_under_table">Link Title 3</a>
|
24
|
+
</td>
|
25
|
+
</tr>
|
26
|
+
<tr class="tr_with_title">
|
27
|
+
<td>
|
28
|
+
<h1 class="title_under_table">Title 4</h1>
|
29
|
+
<a href="http://linkto.title.one" class="title_under_table">Link Title 4</a>
|
30
|
+
</td>
|
31
|
+
</tr>
|
32
|
+
<tr class="tr_with_title">
|
33
|
+
<td>
|
34
|
+
<h1 class="title_under_table">Title 5</h1>
|
35
|
+
<a href="http://linkto.title.one" class="title_under_table">Link Title 5</a>
|
36
|
+
</td>
|
37
|
+
</tr>
|
38
|
+
</table>
|
39
|
+
</body>
|
40
|
+
</html>
|
41
|
+
|
@@ -0,0 +1,13 @@
|
|
1
|
+
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
|
2
|
+
<html>
|
3
|
+
<head>
|
4
|
+
<title>FakePage</title>
|
5
|
+
</head>
|
6
|
+
<body>
|
7
|
+
<h1 class="title">Title 1</h1>
|
8
|
+
<h1 class="title">Title 2</h1>
|
9
|
+
<h1 class="title">Title 3</h1>
|
10
|
+
<h1 class="title">Title 4</h1>
|
11
|
+
</body>
|
12
|
+
</html>
|
13
|
+
|