xupa_emec 1.0.2 → 1.0.3
Sign up to get free protection for your applications and to get access to all the features.
- data/bin/xupa_emec +18 -3
- data/lib/xupa_emec/crawler.rb +14 -3
- data/lib/xupa_emec/version.rb +1 -1
- data/xupa_emec.gemspec +1 -1
- metadata +3 -3
data/bin/xupa_emec
CHANGED
@@ -13,15 +13,20 @@ as opções são:
|
|
13
13
|
EOS
|
14
14
|
opt :entrada, "Arquivo fonte com lista de faculdades exportadas pelo emec", :short => 'i', :default => 'in.xls'
|
15
15
|
opt :saida, "Arquivo csv que será gerado", :short => 'o', :default => 'out.csv'
|
16
|
+
opt :quebraemail, "Gera uma linha por email", :short => 'q'
|
17
|
+
opt :buscacursos, "Busca lista de cursos das IES (demora mais)", :short => 'c'
|
16
18
|
end
|
17
19
|
|
18
|
-
crawler = XupaEmec::Crawler.new
|
20
|
+
crawler = XupaEmec::Crawler.new(:search_courses => opts[:buscacursos])
|
21
|
+
|
22
|
+
headers = ['nome', 'sigla', 'nome_limpo', 'tipo', 'cidade', 'tel', 'site', 'email', 'mantenedora', 'representante_nome', 'representante_primeiro_nome', 'representante_cargo']
|
23
|
+
headers << 'num_cursos' << 'lista_cursos' if opts[:buscacursos]
|
19
24
|
|
20
25
|
File.open(opts[:entrada], "r") do |input|
|
21
26
|
|
22
27
|
FasterCSV.open(opts[:saida], "w",
|
23
28
|
:write_headers => true,
|
24
|
-
:headers =>
|
29
|
+
:headers => headers) do |out_csv|
|
25
30
|
|
26
31
|
in_html = doc = Nokogiri::HTML(input)
|
27
32
|
iess_to_search = in_html.css('table:nth-child(2) tbody tr')
|
@@ -37,8 +42,18 @@ File.open(opts[:entrada], "r") do |input|
|
|
37
42
|
puts
|
38
43
|
puts "#{index+1} - Buscando nome da instituição '#{ies_search_name}'..."
|
39
44
|
|
40
|
-
out_csv << crawler.crawl(ies_search_name)
|
41
45
|
|
46
|
+
if opts[:quebraemail]
|
47
|
+
ies_hash = crawler.crawl(ies_search_name)
|
48
|
+
ies_hash['email'].split(',').each do |email|
|
49
|
+
new_hash = ies_hash.clone
|
50
|
+
new_hash['email'] = email
|
51
|
+
out_csv << new_hash
|
52
|
+
end
|
53
|
+
else
|
54
|
+
out_csv << crawler.crawl(ies_search_name)
|
55
|
+
end
|
56
|
+
|
42
57
|
end
|
43
58
|
|
44
59
|
end
|
data/lib/xupa_emec/crawler.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
module XupaEmec
|
2
2
|
class Crawler
|
3
|
-
def initialize(
|
4
|
-
@
|
3
|
+
def initialize(options={})
|
4
|
+
@search_courses = options[:search_courses]
|
5
|
+
@agent = options[:agent] || Mechanize.new
|
5
6
|
end
|
6
7
|
|
7
8
|
attr_reader :agent
|
@@ -36,6 +37,10 @@ module XupaEmec
|
|
36
37
|
|
37
38
|
ies_info['nome'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(1) > td:nth-child(2)").first.text.strip
|
38
39
|
|
40
|
+
ies_info['sigla'] = ies_info['nome'].split(' - ')[1..-1].join('-')
|
41
|
+
|
42
|
+
ies_info['nome_limpo'] = ies_info['nome'].split(' - ')[0].mb_chars.titleize
|
43
|
+
|
39
44
|
ies_info['cidade'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(5) > td:nth-child(2)").first.text.strip
|
40
45
|
|
41
46
|
ies_info['tel'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(6) > td:nth-child(2)").first.text.strip
|
@@ -44,7 +49,13 @@ module XupaEmec
|
|
44
49
|
|
45
50
|
ies_info['site'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(7) > td:nth-child(4)").first.text.strip
|
46
51
|
|
47
|
-
ies_info['email'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(8) > td:nth-child(2)").first.text.strip
|
52
|
+
ies_info['email'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(8) > td:nth-child(2)").first.text.strip.split(/\s*[\s,;\/\\]\s*/).join(',')
|
53
|
+
|
54
|
+
if @search_courses
|
55
|
+
courses_page= agent.get("http://emec.mec.gov.br/emec/consulta-ies/listar-curso-agrupado/#{ies_url}/page/1/list/1000")
|
56
|
+
ies_info['num_cursos'] = courses_page.search("div.campform > div:first-child").text.match(/Registro\(s\)\: 1 a \d+ de (\d+)/)[1]
|
57
|
+
ies_info['lista_cursos'] = courses_page.search("table#listar-ies-cadastro > tbody > tr").map{|l| l.search('td').first.text.gsub(' ', '').strip}.join(', ')
|
58
|
+
end
|
48
59
|
|
49
60
|
puts "Informação processada para '#{ies_search_name}' :"
|
50
61
|
puts ies_info.to_yaml
|
data/lib/xupa_emec/version.rb
CHANGED
data/xupa_emec.gemspec
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xupa_emec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 17
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 0
|
9
|
-
- 2
|
10
|
-
version: 1.0.2
|
9
|
+
- 3
|
10
|
+
version: 1.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- "Bernardo de P\xC3\xA1dua"
|