congress-scrapper 0.0.1

Sign up to get free protection for your applications and to get access to all the features.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in congress-scrapper.gemspec
gemspec
data/README.md ADDED
@@ -0,0 +1,50 @@
1
+ # Congress Scrapper
2
+
3
+ ## Introduction
4
+
5
+ Congress Scrapper is a Ruby gem to scrape the official Spanish parliament webpage.
6
+
7
+ ## Usage
8
+
9
+ ```ruby
10
+ proposals = Congress::Scrapper.scrape # Array of hashes, one per Congress proposal
11
+ proposal = proposals.first
12
+
13
+ puts proposal[:proposal_type]
14
+ >> "Proyecto de Ley"
15
+
16
+ puts proposal[:title]
17
+ >> "Proyecto de Ley de almacenamiento geológico de dióxido de carbono"
18
+ ```
19
+
20
+ ## Install
21
+
22
+ <pre>
23
+ gem install congress-scrapper
24
+ </pre>
25
+
26
+ ## Contribute
27
+
28
+ 1. Find or create an issue
29
+
30
+ 2. Add a comment to the issue to let people know you're going to work on it
31
+
32
+ 3. Fork
33
+
34
+ 4. Hack your changes in a topic branch (don't forget to write some tests ;)
35
+
36
+ 5. Open a pull request
37
+
38
+ 6. Wait for comments from maintainers or code merge
39
+
40
+
41
+ ## Authors
42
+
43
+ Original author: Luismi Cavallé
44
+
45
+ Code extracted as a Ruby gem by: Raimond García and Alberto Fernández-Capel
46
+
47
+
48
+ ## License
49
+
50
+ Released under the MIT license.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "congress-scrapper/version"
4
+
5
# Packaging metadata for the congress-scrapper gem.
Gem::Specification.new do |s|
  s.name        = "congress-scrapper"
  s.version     = Congress::Scrapper::VERSION
  s.authors     = ["Luismi Cavallé", "Raimond García", "Alberto Fernández-Capel"]
  s.email       = ["voodoorai2000 at gmail"]
  s.homepage    = "http://github.com/agoraciudadana/congress-scrapper"
  s.summary     = %q{Scrapper to get proposals from Spanish Congress}
  s.description = %q{Scrapper to get proposals from Spanish Congress}
  # The README states the gem is released under the MIT license; declare it
  # in the metadata too so tooling can see it.
  s.license     = "MIT"

  # NOTE: `rubyforge_project` was dropped on purpose; the attribute is
  # deprecated in RubyGems and has no replacement.

  # The packaged file lists are taken from what git tracks.
  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Needed only to run the test suite.
  s.add_development_dependency "rspec"
  s.add_development_dependency "webmock"

  # Needed at runtime by the scrapper itself.
  s.add_runtime_dependency "progressbar"
  s.add_runtime_dependency "mechanize"
end
@@ -0,0 +1,5 @@
1
module Congress
  module Scrapper
    # Current gem version. Frozen so the shared string cannot be mutated
    # in place by client code.
    VERSION = "0.0.1".freeze
  end
end
@@ -0,0 +1,79 @@
1
+ # coding: utf-8
2
+
3
+ require "congress-scrapper/version"
4
+ require "mechanize"
5
+ require "progressbar"
6
+
7
module Congress
  # Scrapes legislative proposals from the Spanish Congress website
  # (www.congreso.es) using Mechanize.
  #
  # All methods are available directly on the module:
  #
  #   proposals = Congress::Scrapper.scrape
  #   proposals.first[:title]
  module Scrapper
    extend self

    # Advanced search page where the scrape starts.
    SEARCH_URL = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/Busqueda%20Avanzada"

    # Prefix prepended to each proposal's href to build its official URL.
    BASE_URL = "http://www.congreso.es"

    # Date patterns; both capture day, month and year, in that order.
    PROPOSED_AT_PATTERN = /Presentado\s+el\s+(\d\d)\/(\d\d)\/(\d\d\d\d)/
    CLOSED_AT_PATTERN   = /Concluido\s+.+\s+desde (\d\d)\/(\d\d)\/(\d\d\d\d)/

    # Memoized Mechanize agent shared by every request.
    #
    # @return [Mechanize]
    def agent
      @agent ||= Mechanize.new
    end

    # Runs the search, follows every "Siguiente" (next) link and visits each
    # proposal page found.
    #
    # @return [Array<Hash>] one hash per proposal with the keys :title,
    #   :official_url, :proposal_type, :closed_at, :official_resolution,
    #   :commission_name, :proposer and :proposed_at. Fields that are not
    #   found on the proposal page are nil.
    def scrape
      results_page = submit_search

      total_results = results_page.search("//*[contains(text(), 'Iniciativas encontradas')]/span").first.text.to_i
      progress = ProgressBar.new("Scrapping", total_results)

      proposals = []

      while results_page
        results_page.search(".titulo_iniciativa a").each do |title|
          proposals << scrape_proposal(title)
          progress.inc
        end

        # No "Siguiente" link means this was the last page of results.
        next_page = results_page.link_with(:text => /Siguiente/)
        results_page = next_page && next_page.click
      end

      progress.finish

      proposals
    end

    private

    # Fills in and submits the advanced search form.
    #
    # @return [Mechanize::Page] first page of search results
    def submit_search
      search_page = agent.get(SEARCH_URL)
      search_form = search_page.form_with(:action => /enviarCgiBuscadorAvIniciativas/)
      search_form["TPTR"] = "Competencia Legislativa Plena"
      search_form.submit
    end

    # Visits the page a result link points to and extracts the proposal data.
    #
    # @param title [#[], #content] result link node (responds to [:href] and content)
    # @return [Hash] proposal attributes (see #scrape)
    def scrape_proposal(title)
      href = title[:href]
      page = agent.get(href)

      proposed_at_text = text_for("//*[@class='texto' and contains(normalize-space(text()),'Presentado el')]", page)
      closed_at_text   = text_for(labelled_xpath("Tramitación seguida por la iniciativa:"), page)

      {:title               => clean_text(title.content),
       :official_url        => BASE_URL + href,
       :proposal_type       => clean_text(text_for(".subtitulo_competencias", page)),
       :closed_at           => parse_date(closed_at_text, CLOSED_AT_PATTERN),
       :official_resolution => clean_text(text_for(labelled_xpath("Resultado de la tramitación"), page)),
       :commission_name     => clean_text(text_for(labelled_xpath("Comisión competente:"), page)),
       :proposer            => clean_text(text_for(labelled_xpath("Autor:"), page)),
       :proposed_at         => parse_date(proposed_at_text, PROPOSED_AT_PATTERN)}
    end

    # XPath for the 'texto' element that follows the 'apartado_iniciativa'
    # heading containing +label+.
    #
    # @param label [String] heading text to look for
    # @return [String]
    def labelled_xpath(label)
      "//*[@class='apartado_iniciativa' and contains(normalize-space(text()),'#{label}')]/following-sibling::*[@class='texto']"
    end

    # Content of the first element matching +selector+ in +page+.
    #
    # The page is passed explicitly rather than read from shared module
    # state, so the helper does not depend on which page was fetched last.
    #
    # @param selector [String] CSS or XPath selector
    # @param page [#search] page to query
    # @return [String, nil] nil when nothing matches
    def text_for(selector, page)
      element = page.search(selector).first
      element && element.content
    end

    # Builds a Date from the day/month/year captures of +pattern+.
    #
    # @param text [String, nil] text to scan
    # @param pattern [Regexp] regexp capturing day, month and year, in that order
    # @return [Date, nil] nil when text is nil, does not match, or the
    #   captured numbers do not form a real calendar date
    def parse_date(text, pattern)
      match = text && text.match(pattern)
      return unless match

      day, month, year = match.captures.map { |part| part.to_i }
      # Guard against impossible dates (e.g. 31/02) so a single malformed
      # page does not abort the whole scrape.
      Date.new(year, month, day) if Date.valid_date?(year, month, day)
    end

    # Collapses whitespace runs into single spaces, drops one trailing
    # period and strips the result.
    #
    # [[:space:]] is used instead of \s so that non-breaking spaces are
    # normalized as well.
    #
    # @param text [String, nil]
    # @return [String, nil] nil when text is nil
    def clean_text(text)
      return unless text

      text.gsub(/[[:space:]]+/, " ").sub(/\s*\.\s*\z/, "").strip
    end
  end
end