congress-scrapper 0.0.1
Sign up to get free protection for your applications and to get access to all the features.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.md +50 -0
- data/Rakefile +1 -0
- data/congress-scrapper.gemspec +27 -0
- data/lib/congress-scrapper/version.rb +5 -0
- data/lib/congress-scrapper.rb +79 -0
- data/spec/fixtures/closed_proposal_page.html +788 -0
- data/spec/fixtures/open_proposal_page.html +651 -0
- data/spec/fixtures/proposers.yml +9 -0
- data/spec/fixtures/search_page.html +881 -0
- data/spec/fixtures/search_results_page1.html +526 -0
- data/spec/fixtures/search_results_page2.html +531 -0
- data/spec/lib/scrapper_spec.rb +63 -0
- data/spec/lib/scrapper_spec_helper.rb +41 -0
- data/spec/spec_helper.rb +7 -0
- metadata +116 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Congress Scrapper
|
2
|
+
|
3
|
+
## Introduction
|
4
|
+
|
5
|
+
Congress Scrapper is a Ruby gem that scrapes legislative proposals from the official website of the Spanish Congress (congreso.es).
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
proposals = Congress::Scrapper.scrape # Array of Hashes, one per Congress proposal
|
11
|
+
proposal = proposals.first
|
12
|
+
|
13
|
+
puts proposal[:proposal_type]
|
14
|
+
>> "Proyecto de Ley"
|
15
|
+
|
16
|
+
puts proposal[:title]
|
17
|
+
>> "Proyecto de Ley de almacenamiento geológico de dióxido de carbono"
|
18
|
+
```
|
19
|
+
|
20
|
+
## Install
|
21
|
+
|
22
|
+
<pre>
|
23
|
+
gem install congress-scrapper
|
24
|
+
</pre>
|
25
|
+
|
26
|
+
## Contribute
|
27
|
+
|
28
|
+
1. Find or create an issue
|
29
|
+
|
30
|
+
2. Add a comment to the issue to let people know you're going to work on it
|
31
|
+
|
32
|
+
3. Fork
|
33
|
+
|
34
|
+
4. Hack your changes in a topic branch (don't forget to write some tests ;)
|
35
|
+
|
36
|
+
5. Open a pull request
|
37
|
+
|
38
|
+
6. Wait for feedback from the maintainers or for your code to be merged
|
39
|
+
|
40
|
+
|
41
|
+
## Authors
|
42
|
+
|
43
|
+
Original author: Luismi Cavallé
|
44
|
+
|
45
|
+
Code extracted as a Ruby gem by: Raimond García and Alberto Fernández-Capel
|
46
|
+
|
47
|
+
|
48
|
+
## License
|
49
|
+
|
50
|
+
Released under the MIT license.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "congress-scrapper/version"
|
4
|
+
|
5
|
+
# Gem specification for congress-scrapper.
#
# The version is read from Congress::Scrapper::VERSION, and the packaged file
# lists are taken from `git ls-files`, so the gem has to be built from inside
# a git checkout.
Gem::Specification.new do |s|
  s.name        = "congress-scrapper"
  s.version     = Congress::Scrapper::VERSION
  s.authors     = ["Luismi Cavallé", "Raimond García", "Alberto Fernández-Capel"]
  s.email       = ["voodoorai2000 at gmail"]
  s.homepage    = "http://github.com/agoraciudadana/congress-scrapper"
  s.summary     = %q{Scrapper to get proposals from Spanish Congress}
  s.description = %q{Scrapper to get proposals from Spanish Congress}
  # The README states the gem is released under the MIT license.
  s.license     = "MIT"

  # NOTE: `rubyforge_project` is intentionally not set; the attribute is
  # deprecated in RubyGems and RubyForge itself no longer exists.

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Needed only to run the spec suite.
  s.add_development_dependency "rspec"
  s.add_development_dependency "webmock"

  # Needed at runtime by lib/congress-scrapper.rb.
  s.add_runtime_dependency "progressbar"
  s.add_runtime_dependency "mechanize"
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "date"

require "mechanize"
require "progressbar"

require "congress-scrapper/version"
|
6
|
+
|
7
|
+
module Congress
  # Scrapes the proposals listed by the advanced search of the Spanish
  # Congress website.
  #
  # The module extends itself, so its public methods are called directly on
  # it, e.g. <tt>Congress::Scrapper.scrape</tt>.
  module Scrapper
    extend self

    # Host prepended to the (relative) proposal links to build :official_url.
    BASE_URL = "http://www.congreso.es"

    # Advanced search page whose form is submitted to list the proposals.
    SEARCH_URL = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/Busqueda%20Avanzada"

    # Both patterns capture day, month and year (in that order).
    PROPOSED_AT_PATTERN = /Presentado\s+el\s+(\d\d)\/(\d\d)\/(\d\d\d\d)/
    CLOSED_AT_PATTERN   = /Concluido\s+.+\s+desde (\d\d)\/(\d\d)\/(\d\d\d\d)/

    # Returns the memoized Mechanize agent used for every HTTP request.
    def agent
      @agent ||= Mechanize.new
    end

    # Submits the advanced search restricted to "Competencia Legislativa
    # Plena", follows every "Siguiente" link and visits each listed proposal.
    #
    # A ProgressBar titled "Scrapping" is advanced once per proposal.
    #
    # Returns an Array of Hashes with the keys :title, :official_url,
    # :proposal_type, :closed_at, :official_resolution, :commission_name,
    # :proposer and :proposed_at. Text values are whitespace-normalized
    # Strings, the two dates are Date objects, and any value whose element or
    # pattern is not found on the proposal page is nil.
    #
    # Raises RuntimeError when the search form cannot be found and
    # ArgumentError when a matched date is not a valid calendar date.
    def scrape
      results_page = submit_search
      progress = ProgressBar.new("Scrapping", total_results(results_page))
      proposals = []

      while results_page
        results_page.search(".titulo_iniciativa a").each do |link|
          proposals << scrape_proposal(link)
          progress.inc
        end

        next_link = results_page.link_with(:text => /Siguiente/)
        results_page = next_link && next_link.click
      end

      progress.finish
      proposals
    end

    private

    # Loads the search page, fills in the legislative filter and returns the
    # first page of results.
    def submit_search
      search_page = agent.get(SEARCH_URL)
      search_form = search_page.form_with(:action => /enviarCgiBuscadorAvIniciativas/)
      # Fail with a readable message instead of a NoMethodError on nil.
      raise "Congress search form not found at #{SEARCH_URL}" unless search_form

      search_form["TPTR"] = "Competencia Legislativa Plena"
      search_form.submit
    end

    # Number of results announced by the results page. It only sizes the
    # progress bar, so a missing counter falls back to 0 instead of aborting.
    def total_results(results_page)
      counter = results_page.search("//*[contains(text(), 'Iniciativas encontradas')]/span").first
      counter ? counter.text.to_i : 0
    end

    # Fetches the page a result link points to and builds the proposal Hash.
    def scrape_proposal(link)
      href = link[:href]
      page = agent.get(href)

      proposed_at_text = text_for(page, "//*[@class='texto' and contains(normalize-space(text()),'Presentado el')]")
      closed_at_text   = text_for(page, section_xpath("Tramitación seguida por la iniciativa:"))

      {:title               => clean_text(link.content),
       :official_url        => BASE_URL + href,
       :proposal_type       => clean_text(text_for(page, ".subtitulo_competencias")),
       :closed_at           => extract_date(closed_at_text, CLOSED_AT_PATTERN),
       :official_resolution => clean_text(text_for(page, section_xpath("Resultado de la tramitación"))),
       :commission_name     => clean_text(text_for(page, section_xpath("Comisión competente:"))),
       :proposer            => clean_text(text_for(page, section_xpath("Autor:"))),
       :proposed_at         => extract_date(proposed_at_text, PROPOSED_AT_PATTERN)}
    end

    # XPath for the 'texto' element that follows the 'apartado_iniciativa'
    # heading containing +label+.
    def section_xpath(label)
      "//*[@class='apartado_iniciativa' and contains(normalize-space(text()),'#{label}')]/following-sibling::*[@class='texto']"
    end

    # Builds a Date from the day/month/year captures of +pattern+ in +text+.
    # Returns nil when +text+ is nil or does not match.
    def extract_date(text, pattern)
      match = text && text.match(pattern)
      return unless match

      day, month, year = match.captures.map(&:to_i)
      Date.new(year, month, day)
    end

    # Content of the first element of +page+ matching +selector+, or nil when
    # nothing matches.
    def text_for(page, selector)
      element = page.search(selector).first
      element && element.content
    end

    # Collapses whitespace runs into single spaces, drops a trailing period
    # and strips the result. Returns nil for nil input.
    def clean_text(text)
      return unless text
      text.gsub(/\s+/,' ').gsub(/\s*\.\s*$/, '').strip
    end
  end
end
|