xupa_emec 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/xupa_emec +18 -3
- data/lib/xupa_emec/crawler.rb +14 -3
- data/lib/xupa_emec/version.rb +1 -1
- data/xupa_emec.gemspec +1 -1
- metadata +3 -3
data/bin/xupa_emec
CHANGED
@@ -13,15 +13,20 @@ as opções são:
|
|
13
13
|
EOS
|
14
14
|
opt :entrada, "Arquivo fonte com lista de faculdades exportadas pelo emec", :short => 'i', :default => 'in.xls'
|
15
15
|
opt :saida, "Arquivo csv que será gerado", :short => 'o', :default => 'out.csv'
|
16
|
+
opt :quebraemail, "Gera uma linha por email", :short => 'q'
|
17
|
+
opt :buscacursos, "Busca lista de cursos das IES (demora mais)", :short => 'c'
|
16
18
|
end
|
17
19
|
|
18
|
-
crawler = XupaEmec::Crawler.new
|
20
|
+
crawler = XupaEmec::Crawler.new(:search_courses => opts[:buscacursos])
|
21
|
+
|
22
|
+
headers = ['nome', 'sigla', 'nome_limpo', 'tipo', 'cidade', 'tel', 'site', 'email', 'mantenedora', 'representante_nome', 'representante_primeiro_nome', 'representante_cargo']
|
23
|
+
headers << 'num_cursos' << 'lista_cursos' if opts[:buscacursos]
|
19
24
|
|
20
25
|
File.open(opts[:entrada], "r") do |input|
|
21
26
|
|
22
27
|
FasterCSV.open(opts[:saida], "w",
|
23
28
|
:write_headers => true,
|
24
|
-
:headers =>
|
29
|
+
:headers => headers) do |out_csv|
|
25
30
|
|
26
31
|
in_html = doc = Nokogiri::HTML(input)
|
27
32
|
iess_to_search = in_html.css('table:nth-child(2) tbody tr')
|
@@ -37,8 +42,18 @@ File.open(opts[:entrada], "r") do |input|
|
|
37
42
|
puts
|
38
43
|
puts "#{index+1} - Buscando nome da instituição '#{ies_search_name}'..."
|
39
44
|
|
40
|
-
out_csv << crawler.crawl(ies_search_name)
|
41
45
|
|
46
|
+
if opts[:quebraemail]
|
47
|
+
ies_hash = crawler.crawl(ies_search_name)
|
48
|
+
ies_hash['email'].split(',').each do |email|
|
49
|
+
new_hash = ies_hash.clone
|
50
|
+
new_hash['email'] = email
|
51
|
+
out_csv << new_hash
|
52
|
+
end
|
53
|
+
else
|
54
|
+
out_csv << crawler.crawl(ies_search_name)
|
55
|
+
end
|
56
|
+
|
42
57
|
end
|
43
58
|
|
44
59
|
end
|
data/lib/xupa_emec/crawler.rb
CHANGED
@@ -1,7 +1,8 @@
|
|
1
1
|
module XupaEmec
|
2
2
|
class Crawler
|
3
|
-
def initialize(
|
4
|
-
@
|
3
|
+
def initialize(options={})
|
4
|
+
@search_courses = options[:search_courses]
|
5
|
+
@agent = options[:agent] || Mechanize.new
|
5
6
|
end
|
6
7
|
|
7
8
|
attr_reader :agent
|
@@ -36,6 +37,10 @@ module XupaEmec
|
|
36
37
|
|
37
38
|
ies_info['nome'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(1) > td:nth-child(2)").first.text.strip
|
38
39
|
|
40
|
+
ies_info['sigla'] = ies_info['nome'].split(' - ')[1..-1].join('-')
|
41
|
+
|
42
|
+
ies_info['nome_limpo'] = ies_info['nome'].split(' - ')[0].mb_chars.titleize
|
43
|
+
|
39
44
|
ies_info['cidade'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(5) > td:nth-child(2)").first.text.strip
|
40
45
|
|
41
46
|
ies_info['tel'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(6) > td:nth-child(2)").first.text.strip
|
@@ -44,7 +49,13 @@ module XupaEmec
|
|
44
49
|
|
45
50
|
ies_info['site'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(7) > td:nth-child(4)").first.text.strip
|
46
51
|
|
47
|
-
ies_info['email'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(8) > td:nth-child(2)").first.text.strip
|
52
|
+
ies_info['email'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(8) > td:nth-child(2)").first.text.strip.split(/\s*[\s,;\/\\]\s*/).join(',')
|
53
|
+
|
54
|
+
if @search_courses
|
55
|
+
courses_page= agent.get("http://emec.mec.gov.br/emec/consulta-ies/listar-curso-agrupado/#{ies_url}/page/1/list/1000")
|
56
|
+
ies_info['num_cursos'] = courses_page.search("div.campform > div:first-child").text.match(/Registro\(s\)\: 1 a \d+ de (\d+)/)[1]
|
57
|
+
ies_info['lista_cursos'] = courses_page.search("table#listar-ies-cadastro > tbody > tr").map{|l| l.search('td').first.text.gsub(' ', '').strip}.join(', ')
|
58
|
+
end
|
48
59
|
|
49
60
|
puts "Informação processada para '#{ies_search_name}' :"
|
50
61
|
puts ies_info.to_yaml
|
data/lib/xupa_emec/version.rb
CHANGED
data/xupa_emec.gemspec
CHANGED
metadata
CHANGED
@@ -1,13 +1,13 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: xupa_emec
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
hash:
|
4
|
+
hash: 17
|
5
5
|
prerelease: false
|
6
6
|
segments:
|
7
7
|
- 1
|
8
8
|
- 0
|
9
|
-
-
|
10
|
-
version: 1.0.
|
9
|
+
- 3
|
10
|
+
version: 1.0.3
|
11
11
|
platform: ruby
|
12
12
|
authors:
|
13
13
|
- "Bernardo de P\xC3\xA1dua"
|