yasf 0.0.3

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ *.swp
4
+ .bundle
5
+ .config
6
+ coverage
7
+ InstalledFiles
8
+ lib/bundler/man
9
+ pkg
10
+ rdoc
11
+ spec/reports
12
+ test/tmp
13
+ test/version_tmp
14
+ tmp
15
+
16
+ # YARD artifacts
17
+ .yardoc
18
+ _yardoc
19
+ doc/
data/.rvmrc ADDED
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # This is an RVM Project .rvmrc file, used to automatically load the ruby
4
+ # development environment upon cd'ing into the directory
5
+
6
+ # First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
7
+ # Only full ruby name is supported here, for short names use:
8
+ # echo "rvm use 1.9.3" > .rvmrc
9
+ environment_id="ruby-1.9.3@yasf"
10
+
11
+ # Uncomment the following lines if you want to verify rvm version per project
12
+ # rvmrc_rvm_version="1.15.9 ()" # 1.10.1 seems like a safe start
13
+ # eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
14
+ # echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
15
+ # return 1
16
+ # }
17
+
18
+ # First we attempt to load the desired environment directly from the environment
19
+ # file. This is very fast and efficient compared to running through the entire
20
+ # CLI and selector. If you want feedback on which environment was used then
21
+ # insert the word 'use' after --create as this triggers verbose mode.
22
+ if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
23
+ && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
24
+ then
25
+ \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
26
+ [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
27
+ \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
28
+ if [[ $- == *i* ]] # check for interactive shells
29
+ then echo "Using: $(tput setaf 2)$GEM_HOME$(tput sgr0)" # show the user the ruby and gemset they are using in green
30
+ else echo "Using: $GEM_HOME" # don't use colors in non-interactive shells
31
+ fi
32
+ else
33
+ # If the environment file has not yet been created, use the RVM CLI to select.
34
+ rvm --create use "$environment_id" || {
35
+ echo "Failed to create RVM environment '${environment_id}'."
36
+ return 1
37
+ }
38
+ fi
39
+
40
+ # If you use bundler, this might be useful to you:
41
+ # if [[ -s Gemfile ]] && {
42
+ # ! builtin command -v bundle >/dev/null ||
43
+ # builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
44
+ # }
45
+ # then
46
+ # printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
47
+ # gem install bundler
48
+ # fi
49
+ # if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
50
+ # then
51
+ # bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
52
+ # fi
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in yasf.gemspec
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,30 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ yasf (0.0.1)
5
+ nokogiri (= 1.5.5)
6
+
7
+ GEM
8
+ remote: http://rubygems.org/
9
+ specs:
10
+ diff-lcs (1.1.3)
11
+ fakeweb (1.3.0)
12
+ nokogiri (1.5.5)
13
+ rake (0.9.2.2)
14
+ rspec (2.11.0)
15
+ rspec-core (~> 2.11.0)
16
+ rspec-expectations (~> 2.11.0)
17
+ rspec-mocks (~> 2.11.0)
18
+ rspec-core (2.11.1)
19
+ rspec-expectations (2.11.3)
20
+ diff-lcs (~> 1.1.3)
21
+ rspec-mocks (2.11.3)
22
+
23
+ PLATFORMS
24
+ ruby
25
+
26
+ DEPENDENCIES
27
+ fakeweb
28
+ rake
29
+ rspec
30
+ yasf!
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Algonauti
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ yasf
2
+ ====
3
+
4
+ Yet Another Scraper Framework
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'yasf'
11
+
12
+ And then execute:
13
+
14
+ $ bundle
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install yasf
19
+
20
+ ## Usage
21
+ row_scraper = Yasf.define do
22
+ scrape "h1.title", :title => :text
23
+ scrape "a.brand", :brand => :text, :brand_link => :href
24
+
25
+ result :title, :brand, :brand_link
26
+ end
27
+
28
+ scraper = Yasf.define do
29
+ scrape "table.companies tr.company", :'rows[]' => row_scraper
30
+ result :rows
31
+ end
32
+
33
+ ### And using the scraper:
34
+ url = "http://local.domain"
35
+ results = scraper.extract_from(url)
36
+ result = results.first
37
+ puts result.title
38
+
39
+ ## Contributing
40
+
41
+ 1. Fork it
42
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
43
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
44
+ 4. Push to the branch (`git push origin my-new-feature`)
45
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
+ require "bundler/gem_tasks"
2
+ require "rspec/core/rake_task"
3
+
4
# Define the RSpec rake task with its default settings; the aliases below
# expect it to be registered under the name `spec`.
RSpec::Core::RakeTask.new

# Both `rake` (the default task) and `rake test` delegate to `spec`.
[:default, :test].each { |entry_point| task entry_point => :spec }
@@ -0,0 +1,144 @@
1
+ require "nokogiri"
2
+
3
module Yasf
  # Declarative scraper base class.
  #
  # Subclasses describe what to pull out of a document with +scrape+ rules
  # (selector plus a target => source map) and what to hand back with
  # +result+. Each rule is stored per class; +extract+ applies every rule to
  # the instance's document and returns +result+.
  class Scraper

    class << self

      # Builds a scraper instance for +source+ and returns its extracted
      # result. Arguments are passed straight to +new+.
      def extract_from(source, options = nil)
        new(source, options).extract
      end

      # Defines a processing rule.
      #
      # Accepted shape: an optional leading Symbol (rule name), one or more
      # selectors, and a trailing Hash mapping targets to sources (see
      # +extractor+). When several selectors are given, only the last one
      # is stored with the rule.
      #
      # Raises ArgumentError when the trailing Hash or the selector is missing.
      def scrape(*args)
        name = args.shift if args.first.is_a?(Symbol)
        extraction = extractor(args.pop) if args.last.is_a?(Hash)
        raise ArgumentError, "Missing extractor: the last argument tells us what to extract" unless extraction
        raise ArgumentError, "Missing selector: the first argument tells us what to select" if args.empty?
        rules << [args.pop, unbound_extractor(extraction), name]
      end

      # Returns the array of scraper rules declared on this class. Each entry
      # is [selector, UnboundMethod, rule name or nil]. The list is stored in
      # a class-level instance variable, so it is not shared with subclasses.
      def rules
        @rules ||= []
      end

      # Declares what +extract+ returns by defining the +result+ instance
      # method. With one symbol the value of that accessor is returned; with
      # several, a Struct holding one member per symbol is returned.
      #
      # Raises ArgumentError when no symbol is given.
      def result(*symbols)
        raise ArgumentError, "one symbol to return the value of this accessor" if symbols.empty?
        symbols = symbols.map(&:to_sym)
        if symbols.size == 1
          accessor = symbols.first
          define_method(:result) { send(accessor) }
        else
          struct = Struct.new(*symbols)
          define_method(:result) do
            struct.new(*symbols.map { |accessor| send(accessor) })
          end
        end
      end

      # Creates an extractor that will extract values from the selected
      # element and place them in instance variables of the scraper.
      #
      # +map+ is a Hash of target => source. Targets are accessor names (a
      # "[]" suffix collects values into an array); sources are Symbols or
      # Scraper subclasses. Returns a lambda taking the element; it is meant
      # to be turned into an instance method, so +self+ inside it is the
      # scraper instance at call time.
      def extractor(map)
        extracts = map.map do |target, source|
          reader = extract_value_from(source)
          writer = extract_value_to(target)
          unbound_extractor(lambda do |element|
            value = reader.call(element)
            # nil means "nothing found": leave the target untouched.
            writer.call(self, value) unless value.nil?
          end)
        end
        lambda do |element|
          extracts.each { |extract| extract.bind(self).call(element) }
          true
        end
      end

      private

      # Converts +callable+ into an UnboundMethod of this class by defining
      # it under a temporary name and removing it again. Binding the result
      # to a scraper instance runs the callable with +self+ set to that
      # instance.
      def unbound_extractor(callable)
        define_method :__extractor, callable
        method = instance_method(:__extractor)
        remove_method :__extractor
        method
      end

      # Returns a Proc that will extract a value from an element.
      #
      # * Scraper subclass: the element is scraped with that class.
      # * Symbol: the element's method of that name is called, falling back
      #   to element[symbol].
      #
      # Raises ArgumentError for a class that is not a Scraper and for any
      # other kind of source, so a bad rule fails when it is declared instead
      # of failing later on nil during extraction.
      def extract_value_from(source)
        case source
        when Class
          unless source <= Yasf::Scraper
            raise ArgumentError, "Class must extends Yasf::Scraper"
          end
          lambda { |element| source.new(element).extract }
        when Symbol
          lambda do |element|
            if element.respond_to?(source)
              element.send(source)
            elsif element.respond_to?(:[])
              element[source]
            else
              raise ArgumentError, "Method not found"
            end
          end
        else
          raise ArgumentError, "Unsupported extraction source: #{source.inspect}"
        end
      end

      # Returns a Proc that will set the extract value in the object.
      #
      # Declares an attr_accessor named after +target+ with any "[" / "]"
      # characters removed. A target ending in "[]" appends each value to an
      # array (created on first use); any other target is simply assigned.
      def extract_value_to(target)
        method_name = target.to_s.delete("[]")

        attr_accessor method_name

        reader = method_name.to_sym
        writer = "#{method_name}=".to_sym

        if target.to_s.end_with?("[]")
          lambda do |object, value|
            array = object.send(reader)
            unless array
              array = []
              object.send(writer, array)
            end
            array << value
          end
        else
          lambda { |object, value| object.send(writer, value) }
        end
      end

    end # end self

    # Returns the document being processed.
    attr_reader :document

    # The argument +source+ is a String (url format), or Nokogiri::XML::Element
    #
    # Raises ArgumentError for any other kind of source.
    def initialize(source, options = nil)
      @options = options || {}
      @document =
        case source
        when String
          load_document(source)
        when Nokogiri::XML::Element
          source
        else
          raise ArgumentError, "source not recognized"
        end
    end

    # Scrapes the document and returns the result.
    #
    # Every rule of the class is applied to every element matching its
    # selector. +result+ must have been declared on the class.
    def extract
      self.class.rules.each do |selector, extractor, _rule_name|
        document.search(selector).each do |element|
          extractor.bind(self).call(element)
        end
      end
      result
    end

    private

    # Opens +location+ and parses it as HTML, closing the handle afterwards.
    #
    # URI.open is preferred when open-uri provides it, because Ruby 3.0+ no
    # longer lets Kernel#open fetch URLs. Older Rubies fall back to the
    # Kernel#open that open-uri patches.
    #
    # NOTE(review): both paths can end up in Kernel#open, which treats a
    # leading "|" as a command to run; +location+ must come from a trusted
    # caller.
    def load_document(location)
      parse = lambda { |io| Nokogiri::HTML(io) }
      if defined?(URI) && URI.respond_to?(:open)
        URI.open(location, &parse)
      else
        open(location, &parse)
      end
    end

  end
end
@@ -0,0 +1,3 @@
1
module Yasf
  # Version string of the gem; frozen so it cannot be mutated at runtime.
  VERSION = "0.0.3".freeze
end
data/lib/yasf.rb ADDED
@@ -0,0 +1,15 @@
1
+ require "open-uri"
2
+
3
+ require "yasf/version"
4
+ require "yasf/scraper"
5
+
6
module Yasf
  class << self
    # Creates and returns a new anonymous subclass of Scraper.
    #
    # When a block is given it is evaluated in the context of the new class,
    # so class-level declarations inside the block apply to that class only.
    def define(&block)
      scraper_class = Class.new(Scraper)
      # Parentheses avoid Ruby's "`&' interpreted as argument prefix" warning.
      scraper_class.module_eval(&block) if block
      scraper_class
    end
  end

end
File without changes
@@ -0,0 +1,41 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>FakePage</title>
5
+ </head>
6
+ <body>
7
+ <table>
8
+ <tr class="tr_with_title">
9
+ <td>
10
+ <h1 class="title_under_table">Title 1</h1>
11
+ <a href="http://linkto.title.one" class="title_under_table">Link Title 1</a>
12
+ </td>
13
+ </tr>
14
+ <tr class="tr_with_title">
15
+ <td>
16
+ <h1 class="title_under_table">Title 2</h1>
17
+ <a href="http://linkto.title.one" class="title_under_table">Link Title 2</a>
18
+ </td>
19
+ </tr>
20
+ <tr class="tr_with_title">
21
+ <td>
22
+ <h1 class="title_under_table">Title 3</h1>
23
+ <a href="http://linkto.title.one" class="title_under_table">Link Title 3</a>
24
+ </td>
25
+ </tr>
26
+ <tr class="tr_with_title">
27
+ <td>
28
+ <h1 class="title_under_table">Title 4</h1>
29
+ <a href="http://linkto.title.one" class="title_under_table">Link Title 4</a>
30
+ </td>
31
+ </tr>
32
+ <tr class="tr_with_title">
33
+ <td>
34
+ <h1 class="title_under_table">Title 5</h1>
35
+ <a href="http://linkto.title.one" class="title_under_table">Link Title 5</a>
36
+ </td>
37
+ </tr>
38
+ </table>
39
+ </body>
40
+ </html>
41
+
@@ -0,0 +1,10 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>Basic Example</title>
5
+ </head>
6
+ <body>
7
+ <h1 class="title">Title 1</h1>
8
+ </body>
9
+ </html>
10
+
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>FakePage</title>
5
+ </head>
6
+ <body>
7
+ <h1 class="title">Title 1</h1>
8
+ <h1 class="title">Title 2</h1>
9
+ <h1 class="title">Title 3</h1>
10
+ <h1 class="title">Title 4</h1>
11
+ </body>
12
+ </html>
13
+