congress-scrapper 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ *.gem
2
+ .bundle
3
+ Gemfile.lock
4
+ pkg/*
data/Gemfile ADDED
@@ -0,0 +1,4 @@
1
# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
source "https://rubygems.org"

# Specify your gem's dependencies in congress-scrapper.gemspec
gemspec
data/README.md ADDED
@@ -0,0 +1,50 @@
1
+ # Congress Scrapper
2
+
3
+ ## Introduction
4
+
5
+ Congress Scrapper is a Ruby gem that scrapes legislative proposals from the official website of the Spanish Congress (www.congreso.es).
6
+
7
+ ## Usage
8
+
9
+ ```ruby
10
+ proposals = Congress::Scrapper.scrape # Array of Hashes, one per Congress proposal
11
+ proposal = proposals.first
12
+
13
+ puts proposal[:proposal_type]
14
+ >> "Proyecto de Ley"
15
+
16
+ puts proposal[:title]
17
+ >> "Proyecto de Ley de almacenamiento geológico de dióxido de carbono"
18
+ ```
19
+
20
+ ## Install
21
+
22
+ <pre>
23
+ gem install congress-scrapper
24
+ </pre>
25
+
26
+ ## Contribute
27
+
28
+ 1. Find or create an issue
29
+
30
+ 2. Add a comment to the issue to let people know you're going to work on it
31
+
32
+ 3. Fork
33
+
34
+ 4. Hack your changes in a topic branch (don't forget to write some tests ;)
35
+
36
+ 5. Open a pull request
37
+
38
+ 6. Wait for comments from maintainers or code merge
39
+
40
+
41
+ ## Authors
42
+
43
+ Original author: Luismi Cavallé
44
+
45
+ Code extracted into a Ruby gem by: Raimond García and Alberto Fernández-Capel
46
+
47
+
48
+ ## License
49
+
50
+ Released under the MIT license.
data/Rakefile ADDED
@@ -0,0 +1 @@
1
+ require "bundler/gem_tasks"
@@ -0,0 +1,27 @@
1
+ # -*- encoding: utf-8 -*-
2
+ $:.push File.expand_path("../lib", __FILE__)
3
+ require "congress-scrapper/version"
4
+
5
# Gem metadata for congress-scrapper.
#
# The version comes from Congress::Scrapper::VERSION (lib/congress-scrapper/version),
# and the packaged file lists are taken from whatever git currently tracks, so the
# gem must be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "congress-scrapper"
  s.version     = Congress::Scrapper::VERSION
  s.authors     = ["Luismi Cavallé", "Raimond García", "Alberto Fernández-Capel"]
  s.email       = ["voodoorai2000 at gmail"]
  s.homepage    = "http://github.com/agoraciudadana/congress-scrapper"
  s.summary     = %q{Scrapper to get proposals from Spanish Congress}
  s.description = %q{Scrapper to get proposals from Spanish Congress}
  # The README states the project is released under the MIT license.
  s.license     = "MIT"

  # NOTE: `rubyforge_project` was dropped on purpose; the attribute is deprecated
  # in RubyGems and has no replacement.

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  # Executables are listed by bare name; RubyGems looks them up under bin/.
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Needed only to run the test suite.
  s.add_development_dependency "rspec"
  s.add_development_dependency "webmock"

  # Needed at runtime: progress reporting and HTTP/HTML scraping.
  s.add_runtime_dependency "progressbar"
  s.add_runtime_dependency "mechanize"
end
@@ -0,0 +1,5 @@
1
# Namespace of the congress-scrapper gem.
module Congress
  module Scrapper
    # Release number of the gem, read by the gemspec.
    # Frozen so the shared constant cannot be mutated at runtime.
    VERSION = "0.0.1".freeze
  end
end
@@ -0,0 +1,79 @@
1
+ # coding: utf-8
2
+
3
+ require "congress-scrapper/version"
4
+ require "mechanize"
5
+ require "progressbar"
6
+
7
require "date" # Date is used below; do not rely on another library loading it for us.

# Namespace of the congress-scrapper gem.
module Congress
  # Scrapes legislative proposals from the Spanish Congress website
  # (www.congreso.es) through a Mechanize agent.
  #
  # The module extends itself, so every public method is called on the module:
  #
  #   proposals = Congress::Scrapper.scrape
  #   proposals.first[:title]
  module Scrapper
    extend self

    # Host prepended to each proposal link to build +:official_url+.
    BASE_URL = "http://www.congreso.es".freeze

    # Advanced-search page that hosts the proposals search form.
    SEARCH_URL = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/Busqueda%20Avanzada".freeze

    # Element holding the total number of proposals found by the search.
    RESULTS_COUNT_XPATH = "//*[contains(text(), 'Iniciativas encontradas')]/span".freeze

    # Element holding the "Presentado el dd/mm/yyyy" sentence of a proposal page.
    PROPOSED_AT_XPATH = "//*[@class='texto' and contains(normalize-space(text()),'Presentado el')]".freeze

    # Both patterns capture day, month and year, in that order.
    PROPOSED_AT_PATTERN = /Presentado\s+el\s+(\d\d)\/(\d\d)\/(\d\d\d\d)/
    CLOSED_AT_PATTERN = /Concluido\s+.+\s+desde (\d\d)\/(\d\d)\/(\d\d\d\d)/

    # Memoized Mechanize agent shared by every request of this module.
    #
    # @return [Mechanize]
    def agent
      @agent ||= Mechanize.new
    end

    # Runs the "Competencia Legislativa Plena" search and visits every proposal
    # listed on every results page, following the "Siguiente" link until there
    # is none left. Progress is reported through a ProgressBar.
    #
    # @return [Array<Hash>] one Hash per proposal with the keys +:title+,
    #   +:official_url+, +:proposal_type+, +:closed_at+, +:official_resolution+,
    #   +:commission_name+, +:proposer+ and +:proposed_at+. Text values are
    #   whitespace-normalized Strings (or nil when the page lacks the field);
    #   the two +_at+ values are Dates, or nil when missing or not a real date.
    def scrape
      results_page = submit_search

      total_results = results_page.search(RESULTS_COUNT_XPATH).first.text.to_i
      progress = ProgressBar.new("Scrapping", total_results)

      proposals = []

      while results_page
        results_page.search(".titulo_iniciativa a").each do |link|
          proposals << build_proposal(link)
          progress.inc
        end

        next_link = results_page.link_with(:text => /Siguiente/)
        results_page = next_link && next_link.click
      end

      progress.finish

      proposals
    end

    private

    # Opens the advanced search, restricts it to "Competencia Legislativa Plena"
    # and returns the first page of results.
    def submit_search
      search_form = agent.get(SEARCH_URL).form_with(:action => /enviarCgiBuscadorAvIniciativas/)
      search_form["TPTR"] = "Competencia Legislativa Plena"
      search_form.submit
    end

    # Fetches the detail page behind a results-list link and extracts its fields.
    #
    # @param link [#[], #content] anchor node of the results list
    # @return [Hash] the proposal attributes described in #scrape
    def build_proposal(link)
      # text_for reads from @proposal_page, so it must be set before any lookup.
      @proposal_page = agent.get(link[:href])

      proposed_at_text = text_for(PROPOSED_AT_XPATH)
      closed_at_text = section_text("Tramitación seguida por la iniciativa:")

      {:title => clean_text(link.content),
       :official_url => BASE_URL + link[:href],
       :proposal_type => clean_text(text_for(".subtitulo_competencias")),
       :closed_at => parse_date(closed_at_text, CLOSED_AT_PATTERN),
       :official_resolution => clean_text(section_text("Resultado de la tramitación")),
       :commission_name => clean_text(section_text("Comisión competente:")),
       :proposer => clean_text(section_text("Autor:")),
       :proposed_at => parse_date(proposed_at_text, PROPOSED_AT_PATTERN)}
    end

    # Raw text of the 'texto' element that follows the 'apartado_iniciativa'
    # heading containing +label+ on the current proposal page, or nil.
    def section_text(label)
      text_for("//*[@class='apartado_iniciativa' and contains(normalize-space(text()),'#{label}')]/following-sibling::*[@class='texto']")
    end

    # Builds a Date from the day/month/year captures of +pattern+ in +text+.
    # Returns nil when the text is missing, the pattern does not match, or the
    # captured numbers are not a real calendar date (so one malformed page does
    # not abort the whole scrape).
    def parse_date(text, pattern)
      match = text && text.match(pattern)
      return unless match

      day, month, year = match.captures.map(&:to_i)
      Date.new(year, month, day) if Date.valid_date?(year, month, day)
    end

    # Content of the first node matching +selector+ on the current proposal
    # page, or nil when nothing matches.
    def text_for(selector)
      element = @proposal_page.search(selector).first
      element.nil? ? nil : element.content
    end

    # Collapses whitespace runs into single spaces and drops a trailing period
    # and surrounding blanks. Returns nil for nil input.
    def clean_text(text)
      return unless text
      text.gsub(/\s+/, ' ').gsub(/\s*\.\s*$/, '').strip
    end
  end
end