web-scraper 0.0.5

Sign up to get free protection for your applications and to get access to all the features.
@@ -0,0 +1,5 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
5
+ *.log
data/.rspec ADDED
@@ -0,0 +1,2 @@
1
+ --format Fuubar
2
+ --color
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
+ source "http://rubygems.org"
2
+
3
+ # Specify your gem's dependencies in web-scraper.gemspec
4
+ gemspec
@@ -0,0 +1,7 @@
1
+ == Web Scraper
2
+
3
+ Alpha version of a web scraper idea, created for internal use with
4
+ abstraction in mind.
5
+
6
+ In my view it should be developed further into a DSL for scraping
7
+ web pages in a simple manner.
@@ -0,0 +1 @@
1
+ require 'bundler/gem_tasks'
@@ -0,0 +1,20 @@
1
+ require "logger"
2
+
3
# Builds the Logger instances used by the web scraper.
class LoggerConfigurer

  # File the log is written to when the :log_to_file option is truthy.
  DEFAULT_FILENAME = 'web-scraper.log'

  # Creates a new Logger configured from +options+.
  #
  # options - Hash of settings (all optional):
  #           :log_to_file - when truthy, log to DEFAULT_FILENAME;
  #                          otherwise see .logger_device.
  #           :log_level   - severity threshold; converted with #to_i
  #                          before being assigned to Logger#level.
  #
  # Returns the configured Logger. A fresh instance is built on every call.
  def self.create_logger(options = {})
    logger = Logger.new(logger_device(options[:log_to_file]))
    logger.level = options[:log_level].to_i if options[:log_level]
    logger
  end

  # Chooses the device handed to Logger.new.
  #
  # file_option - truthy to log to DEFAULT_FILENAME.
  #
  # Returns DEFAULT_FILENAME when file_option is truthy. Otherwise returns
  # nil when ENV["WEBSCRAPER_ENV"] is 'test', and STDOUT in every other case.
  def self.logger_device(file_option)
    return DEFAULT_FILENAME if file_option
    return nil if ENV["WEBSCRAPER_ENV"] == 'test'

    STDOUT
  end

end
@@ -0,0 +1,23 @@
1
+ # encoding: utf-8
2
+ require "nokogiri"
3
+
4
# == ScrapedResponse
# Holds the values scraped from a page.
#
# Each key of the options hash becomes a reader method that evaluates the
# XPath stored under that key against the parsed page.
class ScrapedResponse
  # page_content - the page markup, parsed with Nokogiri::HTML.
  # options      - Hash mapping a method name to an XPath expression.
  #
  # The readers are defined on this instance only, so two responses built
  # with different options do not see or overwrite each other's methods.
  def initialize(page_content, options)
    @document = Nokogiri::HTML(page_content)
    options.each do |name, xpath|
      define_singleton_method(name) { text_at(xpath) }
    end
  end

  # Defines one reader per key of +options+ on the class itself, i.e. for
  # every instance. Kept for callers that used it directly; the constructor
  # no longer relies on it.
  def self.create_methods(options)
    options.each do |name, xpath|
      define_method(name) { text_at(xpath) }
    end
  end

  private

  # Returns the stripped text of the first child node selected by +xpath+,
  # or nil when the expression selects nothing.
  #
  # Uses String#strip, not strip!: the bang variant returns nil when there
  # is no surrounding whitespace to remove, which would discard the value.
  def text_at(xpath)
    node = @document.xpath(xpath).children.first
    node && node.text.strip
  end
end
@@ -0,0 +1,5 @@
1
module Web
  module Scraper
    # Version of the gem. Frozen so the shared constant cannot be mutated
    # in place.
    VERSION = "0.0.5".freeze
  end
end
@@ -0,0 +1,35 @@
1
+ require "web-scraper/version"
2
+ require "web-scraper/logger_configurer"
3
+ require "web-scraper/scraped_response"
4
+
5
+ require 'httpclient'
6
+
7
# == Web Scraper
#
# Mixin offering a small DSL: +web_page+ sets up a session, +download+
# fetches a page and +scrape+ extracts values from the last downloaded page.
module WebScraper
  # Starts a scraping session against +base_url+.
  #
  # base_url - prefix prepended to every uri passed to #download.
  # options  - Hash forwarded to LoggerConfigurer.create_logger.
  # block    - optional; evaluated with instance_eval so #download and
  #            #scrape can be called without a receiver.
  #
  # Returns the block's value when a block is given, otherwise the body of
  # the page at uri '/'.
  def web_page(base_url, options = {}, &block)
    @logger = LoggerConfigurer.create_logger(options)
    @client = HTTPClient.new
    @base_url = base_url

    return instance_eval(&block) if block_given?

    download(:uri => '/')
  end

  # Downloads a page with an HTTP POST, following redirects.
  #
  # params - Hash with:
  #          :uri    - path appended to the base url.
  #          :params - request body sent with the POST (may be nil).
  #
  # Returns the response body, which is also remembered for #scrape.
  def download(params)
    target = url(params[:uri])
    @logger.info("downloading... #{target}")
    @logger.debug(" with params... #{params[:params]}")
    response = @client.post(target, :body => params[:params], :follow_redirect => true)
    @last_page = response.body
  end

  # Wraps the last downloaded page in a ScrapedResponse.
  #
  # options - Hash passed through to ScrapedResponse.new.
  def scrape(options)
    ScrapedResponse.new(@last_page, options)
  end

  # Returns +uri+ appended to the base url given to #web_page.
  def url(uri)
    "#{@base_url}#{uri}"
  end

end
@@ -0,0 +1,42 @@
1
+ require "spec_helper"
2
+
3
+ describe LoggerConfigurer do
4
+
5
+ before(:all) do
6
+ ENV['WEBSCRAPER_ENV'] = 'logger_test'
7
+ end
8
+
9
+ after(:all) do
10
+ ENV['WEBSCRAPER_ENV'] = 'test'
11
+ end
12
+
13
+ it "should create a Logger object w/ no options" do
14
+ LoggerConfigurer.create_logger.should be_a Logger
15
+ end
16
+
17
+ it "should check for a :log_level option" do
18
+ LoggerConfigurer.create_logger(:log_level => Logger::INFO).level.should be Logger::INFO
19
+ end
20
+
21
+ context "output to console or file " do
22
+ before(:each) do
23
+ @output = ''
24
+ $stdout.stub!( :write ) { |*args| @output.<<( *args )}
25
+ end
26
+
27
+ it "should return a stdout w/o params" do
28
+ LoggerConfigurer.create_logger.info "testing logger"
29
+ @output.should include("testing logger")
30
+ end
31
+
32
+ it "should return stdout for a log_to_file => false option" do
33
+ LoggerConfigurer.create_logger(:log_to_file => false).info "testing logger"
34
+ @output.should include("testing logger")
35
+ end
36
+
37
+ it "should return FileOutputter for a :file => true option" do
38
+ LoggerConfigurer.create_logger(:log_to_file => true).info "testing logger"
39
+ @output.should be_empty
40
+ end
41
+ end
42
+ end
@@ -0,0 +1,103 @@
1
+ <html>
2
+ <head>
3
+ <style type="text/css">
4
+
5
+ <!--
6
+ @import "pvgistext.css";
7
+ -->
8
+ </style>
9
+ <title>
10
+ PV power estimate information
11
+ </title>
12
+ <meta http-equiv="Content-Type" content="text/html">
13
+ <meta name="author" content="Created by Thomas Huld - JRC ISPRA - ITALY, Thomas.Huld@jrc.it">
14
+ </head>
15
+ <body text="#000000" bgcolor="#F5F5FF" >
16
+
17
+
18
+ <table width="100%" border="0">
19
+ <tr>
20
+ <td align="left" >
21
+
22
+ <p class=title>
23
+ Performance of Grid-connected PV
24
+ <br>
25
+ </p>
26
+ <p>
27
+ <font color=\"red\"> NOTE: before using these calculations for anything serious, you should read </font><a href="javascript:window.open('PVcalchelp_' + 'en' + '.html','help','height=600,width=600,toolbar=no,scrollbars=yes,resizable'); void 0;"><font color=blue> [this]</font></a>
28
+ </p>
29
+ <p class=subtitle align="left">
30
+ PVGIS estimates of solar electricity generation
31
+ </p><br><font color="red">Warning: the system could not read the horizon file supplied, using standard horizon</font><br>Location: 45°49'3" North, 8°49'34" East, Elevation: 389 m a.s.l.,<br><br>
32
+ </td>
33
+ </tr>
34
+ <tr>
35
+ <td align="left">
36
+
37
+ <br>
38
+ Solar radiation database used: PVGIS-classic
39
+ <br>
40
+ <br>
41
+
42
+ Nominal power of the PV system: 1.0 kW (crystalline silicon)<br>
43
+
44
+
45
+ Estimated losses due to temperature: 12.3% (using local ambient temperature)<br>
46
+
47
+ Estimated loss due to angular reflectance effects: 3.3%<br>
48
+ Other losses (cables, inverter etc.): 8.0%<br>
49
+ Combined PV system losses: 22.0%
50
+
51
+ </td>
52
+ </tr>
53
+ <tr>
54
+ <td align="left">
55
+ <br>
56
+ </td>
57
+ </tr> <tr><td align="left">
58
+ <table class=data_table border="1" width="300" >
59
+ <tr>
60
+ <td colspan=5>
61
+ <b>Fixed system: inclination=23°, orientation=60° </b>
62
+ </td>
63
+ </tr>
64
+ <tr>
65
+
66
+ <th align="left">Month</th><th width=40><span class=formula>E</span><sub>d</sub> </th><th width=40> <span class=formula>E</span><sub>m</sub> </th><th width=40><span class=formula>H</span><sub>d</sub></th><th width=40> <span class=formula>H</span><sub>m</sub> </th></tr>
67
+ <tr> <td> Jan </td><td align="right">1.45</td><td align="right">45.0</td><td align="right">1.71</td><td align="right">53.1</td></tr>
68
+ <tr> <td> Feb </td><td align="right">1.87</td><td align="right">52.5</td><td align="right">2.24</td><td align="right">62.7</td></tr>
69
+ <tr> <td> Mar </td><td align="right">2.90</td><td align="right">90.0</td><td align="right">3.61</td><td align="right">112</td></tr>
70
+ <tr> <td> Apr </td><td align="right">3.32</td><td align="right">99.5</td><td align="right">4.23</td><td align="right">127</td></tr>
71
+ <tr> <td> May </td><td align="right">3.50</td><td align="right">108</td><td align="right">4.59</td><td align="right">142</td></tr>
72
+ <tr> <td> Jun </td><td align="right">4.17</td><td align="right">125</td><td align="right">5.56</td><td align="right">167</td></tr>
73
+ <tr> <td> Jul </td><td align="right">4.31</td><td align="right">134</td><td align="right">5.79</td><td align="right">180</td></tr>
74
+ <tr> <td> Aug </td><td align="right">3.90</td><td align="right">121</td><td align="right">5.26</td><td align="right">163</td></tr>
75
+ <tr> <td> Sep </td><td align="right">3.17</td><td align="right">95.1</td><td align="right">4.15</td><td align="right">124</td></tr>
76
+ <tr> <td> Oct </td><td align="right">2.12</td><td align="right">65.7</td><td align="right">2.66</td><td align="right">82.4</td></tr>
77
+ <tr> <td> Nov </td><td align="right">1.47</td><td align="right">44.1</td><td align="right">1.78</td><td align="right">53.4</td></tr>
78
+ <tr> <td> Dec </td><td align="right">1.28</td><td align="right">39.5</td><td align="right">1.53</td><td align="right">47.3</td></tr><tr><td colspan=5> </td></tr><tr><td><b> Yearly average </b></td><td align="right"><b>2.79 </b></td><td align="right"><b>85.0 </b></td></td><td align="right"><b>3.60 </b></td><td align="right"><b>109 </b></td></tr><tr><td><b>Total for year</b></td><td align="right" colspan=2 ><b> 1020 </b> </td> <td align="right" colspan=2 ><b> 1310 </b> </td> </tr></table></td></tr><tr><td><br></td></tr><tr><td><p>
79
+ <span class=formula>E</span><sub>d</sub>: Average daily electricity production from the given system (kWh)<br>
80
+ <span class=formula>E</span><sub>m</sub>: Average monthly electricity production from the given system (kWh)<br>
81
+ <span class=formula>H</span><sub>d</sub>: Average daily sum of global irradiation per square meter received by the modules of the given system (kWh/m<sup>2</sup>)<br>
82
+ <span class=formula>H</span><sub>m</sub>: Average sum of global irradiation per square meter received by the modules of the given system (kWh/m<sup>2</sup>)<br></tr></td><tr><td><tr>
83
+ <td>
84
+ <br>
85
+ <p>
86
+ PVGIS &copy; European Communities, 2001-2010<br>
87
+ Reproduction is authorised, provided the source is acknowledged<br>
88
+ See the disclaimer <a target="legal" href="http://europa.eu/geninfo/legal_notices_en.htm">
89
+ here </a>
90
+
91
+
92
+ </p>
93
+ </td>
94
+ </tr>
95
+ </table>
96
+ <script language="JavaScript">
97
+ opener.window.setAngles(23,60,1,1);
98
+ </script
99
+ <script language="JavaScript">
100
+ window.focus();
101
+ </script>
102
+ </body>
103
+ </html>
@@ -0,0 +1,37 @@
1
+ # encoding: utf-8
2
+ require 'rubygems'
3
+
4
+ # Shared RSpec setup: loads the library and the spec support files.
5
+ ENV["WEBSCRAPER_ENV"] ||= 'test'
6
+ Dir["lib/**/*.rb", "spec/support/*.rb"].each { |f| require File.absolute_path(f) }
7
+ require 'rspec/core'
8
+
9
# Returns the content of a fixture stored under spec/resources.
#
# filename - name of the file inside spec/resources.
#
# The path is relative to the current working directory. File.read is used
# instead of IO.read because it always treats its argument as a file path.
def read_resource(filename)
  File.read("spec/resources/#{filename}")
end
12
+
13
+ RSpec.configure do |config|
14
+ # == Mock Framework
15
+ #
16
+ # If you prefer to use mocha, flexmock or RR, uncomment the appropriate line:
17
+ #
18
+ # config.mock_with :mocha
19
+ # config.mock_with :flexmock
20
+ # config.mock_with :rr
21
+ config.mock_with :rspec
22
+
23
+ # Remove this line if you're not using ActiveRecord or ActiveRecord fixtures
24
+ # config.fixture_path = "#{::Rails.root}/spec/fixtures"
25
+
26
+ # If you're not using ActiveRecord, or you'd prefer not to run each of your
27
+ # examples within a transaction, remove the following line or assign false
28
+ # instead of true.
29
+ # config.use_transactional_fixtures = true
30
+
31
+ # config.before(:suite) do
32
+ # end
33
+ #
34
+ # config.after(:suite) do
35
+ # end
36
+
37
+ end
@@ -0,0 +1,74 @@
1
+ # encoding: utf-8
2
+ #
3
+ ## Bring up server in a new thread (do once?):
4
+ # @mock_server = TestWebServer.new(4000, 0.5)
5
+ #
6
+ ## Pull down server:
7
+ # @mock_server.stop
8
+ #
9
+ ## Expectations (rspec example):
10
+ # request_received = false
11
+ # @mock_server.attach do |env|
12
+ # request_received = true
13
+ # env['REQUEST_METHOD'].should == 'POST'
14
+ # env['PATH_INFO'].should == '/foo'
15
+ # [ 200, { 'Content-Type' => 'text/plain', 'Content-Length' => '40' }, [ 'This gets returned from the HTTP request' ]]
16
+ # end
17
+ # request_received.should be_true
18
+ # my_code_that_should_make_post_request # to http://localhost:4000/foo
19
+ #
20
+ ## After each test:
21
+ # @mock_server.detach
22
+ require "WEBrick"
23
+ require 'rack'
24
+
25
# Rack application served by WEBrick in a background thread. Each request
# is answered by the block registered with #attach (or #stub_response).
class TestWebServer
  # Starts WEBrick in a new thread.
  #
  # port  - TCP port to listen on.
  # pause - seconds to sleep after spawning the thread so the server has
  #         time to start.
  def initialize(port = 4000, pause = 0.5)
    @block = nil
    # Remembered so failures raised while handling a request can be
    # re-raised in the thread that created the server (see #call).
    @parent_thread = Thread.current
    @thread = Thread.new do
      Rack::Handler::WEBrick.run(self, :Port => port,
                                       :Logger => WEBrick::Log.new("/dev/null"),
                                       :AccessLog => [nil, nil])
    end
    sleep pause # give the server time to fire up… YUK!
  end

  # Kills the server thread.
  def stop
    @thread.kill
  end

  # Registers the block that handles the next requests. It receives the
  # Rack env and must return a Rack response triplet.
  def attach(&block)
    @block = block
  end

  # Removes the current handler.
  def detach
    @block = nil
  end

  # Rack entry point: delegates to the attached block.
  #
  # Any exception (including a missing handler) is raised in the parent
  # thread and answered with a 500 response. Exception is rescued on
  # purpose rather than StandardError - presumably so that expectation
  # failures raised inside the handler also reach the spec; confirm before
  # narrowing it.
  def call(env)
    begin
      raise "Specify a handler for the request using attach(block), the block should return a valid rack response and can test expectations" unless @block
      @block.call(env)
    rescue Exception => e
      @parent_thread.raise e
      [ 500, { 'Content-Type' => 'text/plain', 'Content-Length' => '13' }, [ 'Bad test code' ]]
    end
  end

  # Attaches a handler that checks the request method and path and answers
  # 200 with +body+.
  #
  # body    - the content returned.
  # options - Hash (all optional):
  #           :method       - :get (default) or :post.
  #           :path         - expected request path, e.g. '/pippo' (default '/').
  #           :content_type - response Content-Type (default 'text/plain').
  def stub_response(body, options = {})
    http_method  = (options[:method] || 'GET').to_s.upcase
    path         = options[:path] || '/'
    content_type = options[:content_type] || 'text/plain'
    # Content-Length is a byte count; String#length counts characters and
    # under-reports bodies that contain multi-byte characters.
    content_length = body.bytesize.to_s

    attach do |env|
      env['REQUEST_METHOD'].should == http_method
      env['PATH_INFO'].should == path
      [ 200, { 'Content-Type' => content_type, 'Content-Length' => content_length }, [ body ]]
    end
  end

end
@@ -0,0 +1,79 @@
1
+ # encoding: utf-8
2
+ require "spec_helper"
3
+
4
+ describe WebScraper do
5
+
6
+ before(:each) do
7
+ extend WebScraper
8
+ end
9
+
10
+ context "[w/o web server]" do
11
+ it "should be raised an error when calling download method w/o :uri param" do
12
+ expect { download }.to raise_error
13
+ end
14
+ end
15
+
16
+ context "[w/ web server]" do
17
+ before(:all) do
18
+ $web_server = TestWebServer.new
19
+ end
20
+
21
+ context "starting defining dsl" do
22
+
23
+ before(:each) do
24
+ $web_server.stub_response('Root Page', {:method => :post, :path => '/'})
25
+ end
26
+
27
+ it "should accept a url and return its content" do
28
+ page = web_page "http://localhost:4000"
29
+ page.should eq 'Root Page'
30
+ end
31
+
32
+ it "should execute a block as a param and return its content" do
33
+ page = web_page "http://localhost:4000" do
34
+ "test"
35
+ end
36
+ page.should eq "test"
37
+ end
38
+
39
+ end
40
+
41
+ context "download a page" do
42
+ before(:each) do
43
+ $web_server.stub_response('Web Page Scraped', {:method => :post, :path => '/foo'})
44
+ end
45
+
46
+ it "should download a page" do
47
+ page = web_page "http://localhost:4000" do
48
+ download :uri => "/foo"
49
+ end
50
+ page.should eq "Web Page Scraped"
51
+ end
52
+
53
+ end
54
+
55
+ context "scrape a page" do
56
+ before(:each) do
57
+ $web_server.stub_response(read_resource('page.html'), {:method => :post, :path => '/page.html'})
58
+ end
59
+
60
+ it "should scrape content from a downloaded of page" do
61
+ page = web_page "http://localhost:4000" do
62
+ download :uri => "/page.html"
63
+ scrape :avg_monthly_production => '//table/tr[4]/td/table/tr[17]/td[2]/b'
64
+ end
65
+ page.avg_monthly_production.should eq "1020"
66
+ end
67
+
68
+ end
69
+
70
+ after(:each) do
71
+ $web_server.detach
72
+ end
73
+
74
+ after(:all) do
75
+ $web_server.stop
76
+ end
77
+
78
+ end
79
+ end
@@ -0,0 +1,26 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "web-scraper/version"
4
+
5
Gem::Specification.new do |s|
  s.name        = "web-scraper"
  s.version     = Web::Scraper::VERSION
  s.authors     = ["Gian Carlo Pace"]
  s.email       = ["giancarlo.pace@gmail.com"]
  s.homepage    = ""
  s.summary     = %q{Web Values scraper}
  s.description = %q{It's an utility to scrape web pages}

  s.rubyforge_project = "web-scraper"

  # Runtime dependencies: required by the code under lib/.
  s.add_dependency "httpclient", "~> 2.2.1"
  s.add_dependency "nokogiri"
  s.add_dependency "logger"

  # Only the specs use these (RSpec itself and the Rack/WEBrick test
  # server), so they are not forced on users of the gem.
  s.add_development_dependency "rspec"
  s.add_development_dependency "rack"

  # The file lists are taken from git, so the gem must be built from a
  # git checkout.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]
end
metadata ADDED
@@ -0,0 +1,120 @@
1
+ --- !ruby/object:Gem::Specification
2
+ name: web-scraper
3
+ version: !ruby/object:Gem::Version
4
+ version: 0.0.5
5
+ prerelease:
6
+ platform: ruby
7
+ authors:
8
+ - Gian Carlo Pace
9
+ autorequire:
10
+ bindir: bin
11
+ cert_chain: []
12
+ date: 2011-07-07 00:00:00.000000000Z
13
+ dependencies:
14
+ - !ruby/object:Gem::Dependency
15
+ name: httpclient
16
+ requirement: &2160925440 !ruby/object:Gem::Requirement
17
+ none: false
18
+ requirements:
19
+ - - ~>
20
+ - !ruby/object:Gem::Version
21
+ version: 2.2.1
22
+ type: :runtime
23
+ prerelease: false
24
+ version_requirements: *2160925440
25
+ - !ruby/object:Gem::Dependency
26
+ name: rspec
27
+ requirement: &2160925000 !ruby/object:Gem::Requirement
28
+ none: false
29
+ requirements:
30
+ - - ! '>='
31
+ - !ruby/object:Gem::Version
32
+ version: '0'
33
+ type: :runtime
34
+ prerelease: false
35
+ version_requirements: *2160925000
36
+ - !ruby/object:Gem::Dependency
37
+ name: rack
38
+ requirement: &2160924540 !ruby/object:Gem::Requirement
39
+ none: false
40
+ requirements:
41
+ - - ! '>='
42
+ - !ruby/object:Gem::Version
43
+ version: '0'
44
+ type: :runtime
45
+ prerelease: false
46
+ version_requirements: *2160924540
47
+ - !ruby/object:Gem::Dependency
48
+ name: nokogiri
49
+ requirement: &2160924120 !ruby/object:Gem::Requirement
50
+ none: false
51
+ requirements:
52
+ - - ! '>='
53
+ - !ruby/object:Gem::Version
54
+ version: '0'
55
+ type: :runtime
56
+ prerelease: false
57
+ version_requirements: *2160924120
58
+ - !ruby/object:Gem::Dependency
59
+ name: logger
60
+ requirement: &2160923700 !ruby/object:Gem::Requirement
61
+ none: false
62
+ requirements:
63
+ - - ! '>='
64
+ - !ruby/object:Gem::Version
65
+ version: '0'
66
+ type: :runtime
67
+ prerelease: false
68
+ version_requirements: *2160923700
69
+ description: It's an utility to scrape web pages
70
+ email:
71
+ - giancarlo.pace@gmail.com
72
+ executables: []
73
+ extensions: []
74
+ extra_rdoc_files: []
75
+ files:
76
+ - .gitignore
77
+ - .rspec
78
+ - Gemfile
79
+ - README.md
80
+ - Rakefile
81
+ - lib/web-scraper/logger_configurer.rb
82
+ - lib/web-scraper/scraped_response.rb
83
+ - lib/web-scraper/version.rb
84
+ - lib/web_scraper.rb
85
+ - spec/logger_spec.rb
86
+ - spec/resources/page.html
87
+ - spec/spec_helper.rb
88
+ - spec/support/test_web_server.rb
89
+ - spec/web_scraper_spec.rb
90
+ - web-scraper.gemspec
91
+ homepage: ''
92
+ licenses: []
93
+ post_install_message:
94
+ rdoc_options: []
95
+ require_paths:
96
+ - lib
97
+ required_ruby_version: !ruby/object:Gem::Requirement
98
+ none: false
99
+ requirements:
100
+ - - ! '>='
101
+ - !ruby/object:Gem::Version
102
+ version: '0'
103
+ required_rubygems_version: !ruby/object:Gem::Requirement
104
+ none: false
105
+ requirements:
106
+ - - ! '>='
107
+ - !ruby/object:Gem::Version
108
+ version: '0'
109
+ requirements: []
110
+ rubyforge_project: web-scraper
111
+ rubygems_version: 1.8.5
112
+ signing_key:
113
+ specification_version: 3
114
+ summary: Web Values scraper
115
+ test_files:
116
+ - spec/logger_spec.rb
117
+ - spec/resources/page.html
118
+ - spec/spec_helper.rb
119
+ - spec/support/test_web_server.rb
120
+ - spec/web_scraper_spec.rb