yasf 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,19 @@
1
+ *.gem
2
+ *.rbc
3
+ *.swp
4
+ .bundle
5
+ .config
6
+ coverage
7
+ InstalledFiles
8
+ lib/bundler/man
9
+ pkg
10
+ rdoc
11
+ spec/reports
12
+ test/tmp
13
+ test/version_tmp
14
+ tmp
15
+
16
+ # YARD artifacts
17
+ .yardoc
18
+ _yardoc
19
+ doc/
data/.rvmrc ADDED
@@ -0,0 +1,52 @@
1
+ #!/usr/bin/env bash
2
+
3
+ # This is an RVM Project .rvmrc file, used to automatically load the ruby
4
+ # development environment upon cd'ing into the directory
5
+
6
+ # First we specify our desired <ruby>[@<gemset>], the @gemset name is optional,
7
+ # Only full ruby name is supported here, for short names use:
8
+ # echo "rvm use 1.9.3" > .rvmrc
9
+ environment_id="ruby-1.9.3@yasf"
10
+
11
+ # Uncomment the following lines if you want to verify rvm version per project
12
+ # rvmrc_rvm_version="1.15.9 ()" # 1.10.1 seems like a safe start
13
+ # eval "$(echo ${rvm_version}.${rvmrc_rvm_version} | awk -F. '{print "[[ "$1*65536+$2*256+$3" -ge "$4*65536+$5*256+$6" ]]"}' )" || {
14
+ # echo "This .rvmrc file requires at least RVM ${rvmrc_rvm_version}, aborting loading."
15
+ # return 1
16
+ # }
17
+
18
+ # First we attempt to load the desired environment directly from the environment
19
+ # file. This is very fast and efficient compared to running through the entire
20
+ # CLI and selector. If you want feedback on which environment was used then
21
+ # insert the word 'use' after --create as this triggers verbose mode.
22
+ if [[ -d "${rvm_path:-$HOME/.rvm}/environments"
23
+ && -s "${rvm_path:-$HOME/.rvm}/environments/$environment_id" ]]
24
+ then
25
+ \. "${rvm_path:-$HOME/.rvm}/environments/$environment_id"
26
+ [[ -s "${rvm_path:-$HOME/.rvm}/hooks/after_use" ]] &&
27
+ \. "${rvm_path:-$HOME/.rvm}/hooks/after_use" || true
28
+ if [[ $- == *i* ]] # check for interactive shells
29
+ then echo "Using: $(tput setaf 2)$GEM_HOME$(tput sgr0)" # show the user the ruby and gemset they are using in green
30
+ else echo "Using: $GEM_HOME" # don't use colors in non-interactive shells
31
+ fi
32
+ else
33
+ # If the environment file has not yet been created, use the RVM CLI to select.
34
+ rvm --create use "$environment_id" || {
35
+ echo "Failed to create RVM environment '${environment_id}'."
36
+ return 1
37
+ }
38
+ fi
39
+
40
+ # If you use bundler, this might be useful to you:
41
+ # if [[ -s Gemfile ]] && {
42
+ # ! builtin command -v bundle >/dev/null ||
43
+ # builtin command -v bundle | GREP_OPTIONS= \grep $rvm_path/bin/bundle >/dev/null
44
+ # }
45
+ # then
46
+ # printf "%b" "The rubygem 'bundler' is not installed. Installing it now.\n"
47
+ # gem install bundler
48
+ # fi
49
+ # if [[ -s Gemfile ]] && builtin command -v bundle >/dev/null
50
+ # then
51
+ # bundle install | GREP_OPTIONS= \grep -vE '^Using|Your bundle is complete'
52
+ # fi
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so that downloads are not exposed to tampering on
# the wire (the plain-HTTP endpoint offers no transport security).
source "https://rubygems.org"

# Specify your gem's dependencies in yasf.gemspec
gemspec
data/Gemfile.lock ADDED
@@ -0,0 +1,30 @@
1
+ PATH
2
+ remote: .
3
+ specs:
4
+ yasf (0.0.1)
5
+ nokogiri (= 1.5.5)
6
+
7
+ GEM
8
+ remote: http://rubygems.org/
9
+ specs:
10
+ diff-lcs (1.1.3)
11
+ fakeweb (1.3.0)
12
+ nokogiri (1.5.5)
13
+ rake (0.9.2.2)
14
+ rspec (2.11.0)
15
+ rspec-core (~> 2.11.0)
16
+ rspec-expectations (~> 2.11.0)
17
+ rspec-mocks (~> 2.11.0)
18
+ rspec-core (2.11.1)
19
+ rspec-expectations (2.11.3)
20
+ diff-lcs (~> 1.1.3)
21
+ rspec-mocks (2.11.3)
22
+
23
+ PLATFORMS
24
+ ruby
25
+
26
+ DEPENDENCIES
27
+ fakeweb
28
+ rake
29
+ rspec
30
+ yasf!
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2012 Algonauti
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,45 @@
1
+ yasf
2
+ ====
3
+
4
+ Yet Another Scraper Framework
5
+
6
+ ## Installation
7
+
8
+ Add this line to your application's Gemfile:
9
+
10
+ gem 'yasf'
11
+
12
+ And then execute:
13
+
14
+ $ bundle
15
+
16
+ Or install it yourself as:
17
+
18
+ $ gem install yasf
19
+
20
+ ## Usage
21
+ row_scraper = Yasf.define do
22
+ scrape "h1.title", :title => :text
23
+ scrape "a.brand", :brand => :text, :brand_link => :href
24
+
25
+ result :title, :brand, :brand_link
26
+ end
27
+
28
+ scraper = Yasf.define do
29
+ scrape "table.companies tr.company", :'rows[]' => row_scraper
30
+ result :rows
31
+ end
32
+
33
+ ### And using the scraper:
34
+ url = "http://local.domain"
35
+ results = scraper.extract_from(url)
36
+ result = results.first
37
+ puts result.title
38
+
39
+ ## Contributing
40
+
41
+ 1. Fork it
42
+ 2. Create your feature branch (`git checkout -b my-new-feature`)
43
+ 3. Commit your changes (`git commit -am 'Add some feature'`)
44
+ 4. Push to the branch (`git push origin my-new-feature`)
45
+ 5. Create new Pull Request
data/Rakefile ADDED
@@ -0,0 +1,7 @@
1
require "bundler/gem_tasks"
require "rspec/core/rake_task"

# Register the RSpec rake task; the aliases below depend on it being
# available under the name :spec.
RSpec::Core::RakeTask.new

# `rake` (default) and `rake test` both run the spec suite.
[:default, :test].each do |alias_name|
  task alias_name => :spec
end
@@ -0,0 +1,144 @@
1
+ require "nokogiri"
2
+
3
module Yasf
  # Base class for declarative scrapers.
  #
  # Subclasses declare processing rules with +scrape+ and the shape of the
  # returned value with +result+. Calling +extract+ on an instance runs every
  # rule against the document and returns whatever +result+ was declared to
  # return.
  class Scraper
    class << self
      # Builds a scraper for +source+ and runs it in one step.
      #
      # source  - a String location or a Nokogiri::XML::Element
      #           (see Scraper#initialize).
      # options - optional Hash, handed to the constructor unchanged.
      #
      # Returns the value produced by the scraper's +result+ method.
      def extract_from(source, options = nil)
        new(source, options).extract
      end

      # Defines a processing rule.
      #
      #   scrape "h1.title", :title => :text
      #   scrape "tr.company", :'rows[]' => RowScraper
      #
      # args - an optional leading Symbol (rule name, stored with the rule),
      #        a selector, and a trailing Hash mapping targets to sources
      #        (see +extractor+). If several selectors are given, the last
      #        one is the one stored.
      #
      # Raises ArgumentError when the trailing Hash or the selector is
      # missing, or when a source in the Hash is not supported.
      def scrape(*args)
        name = args.shift if args.first.is_a?(Symbol)
        mapping = args.pop if args.last.is_a?(Hash)
        # Validate before building the extractor: building it defines
        # accessors on the class, which should not happen for a bad call.
        raise ArgumentError, "Missing extractor: the last argument tells us what to extract" unless mapping
        raise ArgumentError, "Missing selector: the first argument tells us what to select" if args.empty?

        # Turn the extractor lambda into an UnboundMethod so that it can be
        # bound to (and run with self set to) a scraper instance later on.
        define_method :__extractor, extractor(mapping)
        rule = instance_method(:__extractor)
        remove_method :__extractor
        rules << [args.pop, rule, name]
      end

      # Returns the array of rules declared on this class. Each rule is a
      # [selector, UnboundMethod, name] triple. The array is stored per class.
      def rules
        @rules ||= []
      end

      # Declares what +extract+ returns by defining the instance method
      # +result+.
      #
      # With a single symbol, +result+ returns the value of that accessor.
      # With several symbols, +result+ returns a Struct whose members are the
      # given symbols, filled from the matching accessors.
      #
      # Raises ArgumentError when called without arguments.
      def result(*symbols)
        raise ArgumentError, "one symbol to return the value of this accessor" if symbols.empty?
        symbols = symbols.map { |symbol| symbol.to_sym }
        if symbols.size == 1
          accessor = symbols.first
          define_method(:result) { send(accessor) }
        else
          struct = Struct.new(*symbols)
          define_method(:result) do
            struct.new(*symbols.map { |symbol| send(symbol) })
          end
        end
      end

      # Creates an extractor that will extract values from the selected
      # element and place them in instance variables of the scraper.
      #
      # map - Hash of target => source. A target is an accessor name; a
      #       trailing "[]" makes it collect values into an array. A source
      #       is a Symbol (method or attribute to read from the element) or
      #       a Yasf::Scraper subclass (run against the element).
      #
      # Returns a lambda taking the element. It binds its steps to +self+,
      # so it is meant to be installed with define_method (as +scrape+
      # does) rather than called directly.
      def extractor(map)
        extracts = map.map do |target, source|
          read = extract_value_from(source)
          write = extract_value_to(target)
          define_method :__extractor do |element|
            value = read.call(element)
            # nil means "nothing extracted": leave the target untouched.
            write.call(self, value) unless value.nil?
          end
          step = instance_method(:__extractor)
          remove_method :__extractor
          step
        end
        lambda do |element|
          extracts.each { |extract| extract.bind(self).call(element) }
          true
        end
      end

      private

      # Returns a lambda that extracts a value from an element.
      #
      # Raises ArgumentError for a Class that is not a Yasf::Scraper and for
      # any source that is neither a Class nor a Symbol, so that a bad rule
      # fails when it is declared instead of failing later on nil.
      def extract_value_from(source)
        case source
        when Class
          unless source.ancestors.include?(Yasf::Scraper)
            raise ArgumentError, "Class must extends Yasf::Scraper"
          end
          lambda { |element| source.new(element).extract }
        when Symbol
          lambda do |element|
            if element.respond_to?(source)
              element.send(source)
            elsif element.respond_to?(:[])
              # Not a method of the element: fall back to an indexed lookup.
              element[source]
            else
              raise ArgumentError, "Method not found"
            end
          end
        else
          raise ArgumentError, "Unsupported extractor source: #{source.inspect}"
        end
      end

      # Returns a lambda that stores an extracted value in the scraper.
      #
      # Defines an accessor named after +target+ (brackets removed). A target
      # ending in "[]" appends each value to an array that is created on
      # first use; any other target is simply assigned.
      def extract_value_to(target)
        method_name = target.to_s.delete("[]")

        attr_accessor method_name

        reader = method_name.to_sym
        writer = "#{method_name}=".to_sym
        if target.to_s.end_with?("[]")
          lambda do |object, value|
            array = object.send(reader)
            object.send(writer, array = []) unless array
            array << value
          end
        else
          lambda { |object, value| object.send(writer, value) }
        end
      end
    end # end self

    # Returns the document being processed.
    attr_reader :document

    # The argument +source+ is a String (url format), or Nokogiri::XML::Element
    #
    # A String is opened and parsed as HTML. URI.open is used when open-uri
    # provides it, because Kernel#open no longer fetches URLs on Ruby 3.0+;
    # older runtimes fall back to Kernel#open. The block form closes the IO
    # once parsing is done.
    #
    # NOTE(review): a String is handed to open as-is; only pass trusted
    # locations.
    #
    # Raises ArgumentError for any other kind of source.
    def initialize(source, options = nil)
      @options = options || {}
      case source
      when String
        parse = lambda { |io| Nokogiri::HTML(io) }
        @document =
          if defined?(URI) && URI.respond_to?(:open)
            URI.open(source, &parse)
          else
            open(source, &parse)
          end
      when Nokogiri::XML::Element
        @document = source
      else
        raise ArgumentError, "source not recognized"
      end
    end

    # Scrapes the document and returns the result.
    #
    # Runs every rule of the class, in declaration order, against each
    # element matched by the rule's selector, then returns +result+ (which
    # the class must have declared).
    def extract
      self.class.rules.each do |selector, rule, _name|
        bound = rule.bind(self)
        document.search(selector).each { |element| bound.call(element) }
      end
      result
    end
  end
end
@@ -0,0 +1,3 @@
1
module Yasf
  # Version of the gem. Frozen so that it cannot be mutated in place.
  VERSION = "0.0.3".freeze
end
data/lib/yasf.rb ADDED
@@ -0,0 +1,15 @@
1
+ require "open-uri"
2
+
3
+ require "yasf/version"
4
+ require "yasf/scraper"
5
+
6
module Yasf
  class << self
    # Builds an anonymous subclass of Yasf::Scraper.
    #
    # When a block is given it is evaluated as the body of the new class, so
    # the Scraper class-level DSL can be called directly inside it:
    #
    #   scraper = Yasf.define do
    #     scrape "h1.title", :title => :text
    #     result :title
    #   end
    #
    # Returns the new class; without a block it is a bare subclass.
    def define(&block)
      Class.new(Scraper, &block)
    end
  end
end
File without changes
@@ -0,0 +1,41 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>FakePage</title>
5
+ </head>
6
+ <body>
7
+ <table>
8
+ <tr class="tr_with_title">
9
+ <td>
10
+ <h1 class="title_under_table">Title 1</h1>
11
+ <a href="http://linkto.title.one" class="title_under_table">Link Title 1</a>
12
+ </td>
13
+ </tr>
14
+ <tr class="tr_with_title">
15
+ <td>
16
+ <h1 class="title_under_table">Title 2</h1>
17
+ <a href="http://linkto.title.one" class="title_under_table">Link Title 2</a>
18
+ </td>
19
+ </tr>
20
+ <tr class="tr_with_title">
21
+ <td>
22
+ <h1 class="title_under_table">Title 3</h1>
23
+ <a href="http://linkto.title.one" class="title_under_table">Link Title 3</a>
24
+ </td>
25
+ </tr>
26
+ <tr class="tr_with_title">
27
+ <td>
28
+ <h1 class="title_under_table">Title 4</h1>
29
+ <a href="http://linkto.title.one" class="title_under_table">Link Title 4</a>
30
+ </td>
31
+ </tr>
32
+ <tr class="tr_with_title">
33
+ <td>
34
+ <h1 class="title_under_table">Title 5</h1>
35
+ <a href="http://linkto.title.one" class="title_under_table">Link Title 5</a>
36
+ </td>
37
+ </tr>
38
+ </table>
39
+ </body>
40
+ </html>
41
+
@@ -0,0 +1,10 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>Basic Example</title>
5
+ </head>
6
+ <body>
7
+ <h1 class="title">Title 1</h1>
8
+ </body>
9
+ </html>
10
+
@@ -0,0 +1,13 @@
1
+ <!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
2
+ <html>
3
+ <head>
4
+ <title>FakePage</title>
5
+ </head>
6
+ <body>
7
+ <h1 class="title">Title 1</h1>
8
+ <h1 class="title">Title 2</h1>
9
+ <h1 class="title">Title 3</h1>
10
+ <h1 class="title">Title 4</h1>
11
+ </body>
12
+ </html>
13
+