congress-scrapper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/.gitignore +4 -0
- data/Gemfile +4 -0
- data/README.md +50 -0
- data/Rakefile +1 -0
- data/congress-scrapper.gemspec +27 -0
- data/lib/congress-scrapper/version.rb +5 -0
- data/lib/congress-scrapper.rb +79 -0
- data/spec/fixtures/closed_proposal_page.html +788 -0
- data/spec/fixtures/open_proposal_page.html +651 -0
- data/spec/fixtures/proposers.yml +9 -0
- data/spec/fixtures/search_page.html +881 -0
- data/spec/fixtures/search_results_page1.html +526 -0
- data/spec/fixtures/search_results_page2.html +531 -0
- data/spec/lib/scrapper_spec.rb +63 -0
- data/spec/lib/scrapper_spec_helper.rb +41 -0
- data/spec/spec_helper.rb +7 -0
- metadata +116 -0
data/.gitignore
ADDED
data/Gemfile
ADDED
data/README.md
ADDED
@@ -0,0 +1,50 @@
|
|
1
|
+
# Congress Scrapper
|
2
|
+
|
3
|
+
## Introduction
|
4
|
+
|
5
|
+
Congress Scrapper is a Ruby gem to scrape the official Spanish parliament webpage.
|
6
|
+
|
7
|
+
## Usage
|
8
|
+
|
9
|
+
```ruby
|
10
|
+
proposals = Congress::Scrapper.scrape # Array with Congress proposals
|
11
|
+
proposal = proposals.first
|
12
|
+
|
13
|
+
puts proposal[:proposal_type]
|
14
|
+
>> "Proyecto de Ley"
|
15
|
+
|
16
|
+
puts proposal[:title]
|
17
|
+
>> "Proyecto de Ley de almacenamiento geológico de dióxido de carbono"
|
18
|
+
```
|
19
|
+
|
20
|
+
## Install
|
21
|
+
|
22
|
+
<pre>
|
23
|
+
gem install congress-scrapper
|
24
|
+
</pre>
|
25
|
+
|
26
|
+
## Contribute
|
27
|
+
|
28
|
+
1. Find or create an issue
|
29
|
+
|
30
|
+
2. Add a comment to the issue to let people know you're going to work on it
|
31
|
+
|
32
|
+
3. Fork
|
33
|
+
|
34
|
+
4. Hack your changes in a topic branch (don't forget to write some tests ;)
|
35
|
+
|
36
|
+
5. Open a pull request
|
37
|
+
|
38
|
+
6. Wait for comments from maintainers or code merge
|
39
|
+
|
40
|
+
|
41
|
+
## Authors
|
42
|
+
|
43
|
+
Original author: Luismi Cavallé
|
44
|
+
|
45
|
+
Code extracted as a ruby gem by: Raimond García and Alberto Fernández-Capel
|
46
|
+
|
47
|
+
|
48
|
+
## License
|
49
|
+
|
50
|
+
Released under the MIT license.
|
data/Rakefile
ADDED
@@ -0,0 +1 @@
|
|
1
|
+
require "bundler/gem_tasks"
|
@@ -0,0 +1,27 @@
|
|
1
|
+
# -*- encoding: utf-8 -*-
|
2
|
+
$:.push File.expand_path("../lib", __FILE__)
|
3
|
+
require "congress-scrapper/version"
|
4
|
+
|
5
|
+
# Gem specification for congress-scrapper.
#
# The file list, test files and executables are taken from `git ls-files`,
# so the gem must be built from inside a git checkout.
Gem::Specification.new do |s|
  s.name        = "congress-scrapper"
  s.version     = Congress::Scrapper::VERSION
  s.authors     = ["Luismi Cavallé", "Raimond García", "Alberto Fernández-Capel"]
  # NOTE(review): this is an obfuscated address, not a deliverable e-mail;
  # replace it with the real contact address once confirmed by the maintainer.
  s.email       = ["voodoorai2000 at gmail"]
  s.homepage    = "http://github.com/agoraciudadana/congress-scrapper"
  s.summary     = %q{Scrapper to get proposals from Spanish Congress}
  # Kept distinct from the summary: RubyGems warns at build time when both
  # fields carry the same text.
  s.description = %q{Scrapes the legislative proposals published on the official Spanish Congress website (congreso.es) and returns them as an array of hashes.}
  # The README states that the gem is released under the MIT license.
  s.license     = "MIT"

  # `rubyforge_project` is intentionally not set: the attribute is deprecated
  # in RubyGems and has no replacement.

  s.files         = `git ls-files`.split("\n")
  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
  s.executables   = `git ls-files -- bin/*`.split("\n").map { |f| File.basename(f) }
  s.require_paths = ["lib"]

  # Needed only to run the spec suite.
  s.add_development_dependency "rspec"
  s.add_development_dependency "webmock"

  # Required by lib/congress-scrapper.rb at load time.
  s.add_runtime_dependency "progressbar"
  s.add_runtime_dependency "mechanize"
end
|
@@ -0,0 +1,79 @@
|
|
1
|
+
# coding: utf-8
|
2
|
+
|
3
|
+
require "date"

require "congress-scrapper/version"
require "mechanize"
require "progressbar"
|
6
|
+
|
7
|
+
module Congress
  # Scraper for the legislative proposals listed on the Spanish Congress
  # website. All methods are callable directly on the module
  # (`Congress::Scrapper.scrape`) because the module extends itself.
  #
  # The page currently being parsed is kept in the module-level
  # `@proposal_page` instance variable, so concurrent calls to `scrape`
  # would overwrite each other's state.
  module Scrapper
    extend self

    # Advanced-search page that hosts the proposals search form.
    SEARCH_URL = "http://www.congreso.es/portal/page/portal/Congreso/Congreso/Iniciativas/Busqueda%20Avanzada".freeze

    # Host prepended to the (site-relative) proposal links.
    BASE_URL = "http://www.congreso.es".freeze

    # XPath of the node that carries the "Presentado el dd/mm/yyyy" text.
    PROPOSED_AT_XPATH = "//*[@class='texto' and contains(normalize-space(text()),'Presentado el')]".freeze

    # Both patterns capture day, month and year, in that order.
    PROPOSED_AT_PATTERN = /Presentado\s+el\s+(\d\d)\/(\d\d)\/(\d\d\d\d)/
    CLOSED_AT_PATTERN   = /Concluido\s+.+\s+desde (\d\d)\/(\d\d)\/(\d\d\d\d)/

    # Memoized HTTP agent shared by every request.
    #
    # @return [Mechanize]
    def agent
      @agent ||= Mechanize.new
    end

    # Runs the "Competencia Legislativa Plena" search, follows every
    # "Siguiente" (next) link and visits each listed proposal.
    #
    # @return [Array<Hash>] one hash per proposal with the keys :title,
    #   :official_url, :proposal_type, :closed_at, :official_resolution,
    #   :commission_name, :proposer and :proposed_at. Text values are Strings
    #   or nil, date values are Dates or nil.
    # @raise [RuntimeError] when the search form is not present on the page
    def scrape
      results_page = submit_search
      progress = ProgressBar.new("Scrapping", total_results(results_page))
      proposals = []

      while results_page
        results_page.search(".titulo_iniciativa a").each do |title|
          proposals << scrape_proposal(title)
          progress.inc
        end

        next_page = results_page.link_with(:text => /Siguiente/)
        results_page = next_page && next_page.click
      end

      progress.finish

      proposals
    end

    private

    # Loads the search page, fills in the proposal type and submits the form.
    # Returns the first page of results.
    def submit_search
      search_page = agent.get(SEARCH_URL)
      search_form = search_page.form_with(:action => /enviarCgiBuscadorAvIniciativas/)
      # Fail with a readable message instead of a NoMethodError on nil.
      raise "Search form not found on #{SEARCH_URL}" unless search_form

      search_form["TPTR"] = "Competencia Legislativa Plena"
      search_form.submit
    end

    # Number of proposals announced by the results page; 0 when the page
    # carries no counter, so a page without it does not abort the scrape.
    def total_results(results_page)
      counter = results_page.search("//*[contains(text(), 'Iniciativas encontradas')]/span").first
      counter ? counter.text.to_i : 0
    end

    # Fetches the page behind a result link and extracts the proposal hash.
    def scrape_proposal(title)
      href = title[:href]
      @proposal_page = agent.get(href)

      {:title               => clean_text(title.content),
       :official_url        => BASE_URL + href,
       :proposal_type       => clean_text(text_for(".subtitulo_competencias")),
       :closed_at           => parse_date(text_for(section_xpath("Tramitación seguida por la iniciativa:")), CLOSED_AT_PATTERN),
       :official_resolution => clean_text(text_for(section_xpath("Resultado de la tramitación"))),
       :commission_name     => clean_text(text_for(section_xpath("Comisión competente:"))),
       :proposer            => clean_text(text_for(section_xpath("Autor:"))),
       :proposed_at         => parse_date(text_for(PROPOSED_AT_XPATH), PROPOSED_AT_PATTERN)}
    end

    # XPath of the 'texto' sibling that follows the section heading
    # containing +label+.
    def section_xpath(label)
      "//*[@class='apartado_iniciativa' and contains(normalize-space(text()),'#{label}')]" \
        "/following-sibling::*[@class='texto']"
    end

    # Builds a Date from the day/month/year captures of +pattern+.
    # Returns nil when +text+ is nil or does not match.
    def parse_date(text, pattern)
      match = text && text.match(pattern)
      Date.new(match[3].to_i, match[2].to_i, match[1].to_i) if match
    end

    # Content of the first node of the current proposal page that matches
    # +selector+, or nil when nothing matches.
    def text_for(selector)
      element = @proposal_page.search(selector).first
      element && element.content
    end

    # Collapses whitespace runs into single spaces and drops one trailing
    # period. Returns nil for nil input.
    #
    # [[:space:]] is used rather than \s because \s only matches ASCII
    # whitespace and would leave non-breaking spaces (U+00A0) untouched.
    def clean_text(text)
      return unless text
      text.gsub(/[[:space:]]+/, ' ').sub(/\s*\.\s*\z/, '').strip
    end
  end
end
|