xupa_emec 1.0.2 → 1.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- data/bin/xupa_emec +18 -3
- data/lib/xupa_emec/crawler.rb +14 -3
- data/lib/xupa_emec/version.rb +1 -1
- data/xupa_emec.gemspec +1 -1
- metadata +3 -3
    
        data/bin/xupa_emec
    CHANGED
    
    | @@ -13,15 +13,20 @@ as opções são: | |
| 13 13 | 
             
              EOS
         | 
| 14 14 | 
             
              opt :entrada, "Arquivo fonte com lista de faculdades exportadas pelo emec", :short => 'i', :default => 'in.xls'
         | 
| 15 15 | 
             
              opt :saida, "Arquivo csv que será gerado", :short => 'o', :default => 'out.csv'
         | 
| 16 | 
            +
              opt :quebraemail, "Gera uma linha por email", :short => 'q' 
         | 
| 17 | 
            +
              opt :buscacursos, "Busca lista de cursos das IES (demora mais)", :short => 'c' 
         | 
| 16 18 | 
             
            end
         | 
| 17 19 |  | 
| 18 | 
            -
            crawler = XupaEmec::Crawler.new
         | 
| 20 | 
            +
            crawler = XupaEmec::Crawler.new(:search_courses => opts[:buscacursos])
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            headers = ['nome', 'sigla', 'nome_limpo', 'tipo', 'cidade', 'tel', 'site', 'email', 'mantenedora', 'representante_nome', 'representante_primeiro_nome', 'representante_cargo']
         | 
| 23 | 
            +
            headers << 'num_cursos' << 'lista_cursos' if opts[:buscacursos]
         | 
| 19 24 |  | 
| 20 25 | 
             
            File.open(opts[:entrada], "r") do |input|
         | 
| 21 26 |  | 
| 22 27 | 
             
              FasterCSV.open(opts[:saida], "w", 
         | 
| 23 28 | 
             
                :write_headers => true,
         | 
| 24 | 
            -
                :headers =>  | 
| 29 | 
            +
                :headers => headers) do |out_csv|
         | 
| 25 30 |  | 
| 26 31 | 
             
                in_html = doc = Nokogiri::HTML(input)
         | 
| 27 32 | 
             
                iess_to_search = in_html.css('table:nth-child(2) tbody tr')
         | 
| @@ -37,8 +42,18 @@ File.open(opts[:entrada], "r") do |input| | |
| 37 42 | 
             
                  puts
         | 
| 38 43 | 
             
                  puts "#{index+1} - Buscando nome da instituição '#{ies_search_name}'..."
         | 
| 39 44 |  | 
| 40 | 
            -
                  out_csv << crawler.crawl(ies_search_name)
         | 
| 41 45 |  | 
| 46 | 
            +
                  if opts[:quebraemail]
         | 
| 47 | 
            +
                    ies_hash = crawler.crawl(ies_search_name)
         | 
| 48 | 
            +
                    ies_hash['email'].split(',').each do |email|
         | 
| 49 | 
            +
                      new_hash = ies_hash.clone
         | 
| 50 | 
            +
                      new_hash['email'] = email
         | 
| 51 | 
            +
                      out_csv << new_hash
         | 
| 52 | 
            +
                    end
         | 
| 53 | 
            +
                  else
         | 
| 54 | 
            +
                    out_csv << crawler.crawl(ies_search_name)
         | 
| 55 | 
            +
                  end
         | 
| 56 | 
            +
                  
         | 
| 42 57 | 
             
                end
         | 
| 43 58 |  | 
| 44 59 | 
             
              end
         | 
    
        data/lib/xupa_emec/crawler.rb
    CHANGED
    
    | @@ -1,7 +1,8 @@ | |
| 1 1 | 
             
            module XupaEmec
         | 
| 2 2 | 
             
              class Crawler
         | 
| 3 | 
            -
                def initialize( | 
| 4 | 
            -
                  @ | 
| 3 | 
            +
                def initialize(options={})
         | 
| 4 | 
            +
                  @search_courses = options[:search_courses]
         | 
| 5 | 
            +
                  @agent = options[:agent] || Mechanize.new
         | 
| 5 6 | 
             
                end
         | 
| 6 7 |  | 
| 7 8 | 
             
                attr_reader :agent
         | 
| @@ -36,6 +37,10 @@ module XupaEmec | |
| 36 37 |  | 
| 37 38 | 
             
                  ies_info['nome'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(1) > td:nth-child(2)").first.text.strip
         | 
| 38 39 |  | 
| 40 | 
            +
                  ies_info['sigla'] = ies_info['nome'].split(' - ')[1..-1].join('-')
         | 
| 41 | 
            +
             | 
| 42 | 
            +
                  ies_info['nome_limpo'] = ies_info['nome'].split(' - ')[0].mb_chars.titleize
         | 
| 43 | 
            +
             | 
| 39 44 | 
             
                  ies_info['cidade'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(5) > td:nth-child(2)").first.text.strip
         | 
| 40 45 |  | 
| 41 46 | 
             
                  ies_info['tel'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(6) > td:nth-child(2)").first.text.strip
         | 
| @@ -44,7 +49,13 @@ module XupaEmec | |
| 44 49 |  | 
| 45 50 | 
             
                  ies_info['site'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(7) > td:nth-child(4)").first.text.strip
         | 
| 46 51 |  | 
| 47 | 
            -
                  ies_info['email'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(8) > td:nth-child(2)").first.text.strip
         | 
| 52 | 
            +
                  ies_info['email'] = ies_data.search("table.tab_paleta > tr:nth-child(4) tr:nth-child(8) > td:nth-child(2)").first.text.strip.split(/\s*[\s,;\/\\]\s*/).join(',')
         | 
| 53 | 
            +
             | 
| 54 | 
            +
                  if @search_courses
         | 
| 55 | 
            +
                    courses_page= agent.get("http://emec.mec.gov.br/emec/consulta-ies/listar-curso-agrupado/#{ies_url}/page/1/list/1000")
         | 
| 56 | 
            +
                    ies_info['num_cursos'] = courses_page.search("div.campform > div:first-child").text.match(/Registro\(s\)\: 1 a \d+ de (\d+)/)[1]
         | 
| 57 | 
            +
                    ies_info['lista_cursos'] = courses_page.search("table#listar-ies-cadastro > tbody > tr").map{|l| l.search('td').first.text.gsub(' ', '').strip}.join(', ')
         | 
| 58 | 
            +
                  end      
         | 
| 48 59 |  | 
| 49 60 | 
             
                  puts "Informação processada para '#{ies_search_name}' :"
         | 
| 50 61 | 
             
                  puts ies_info.to_yaml
         | 
    
        data/lib/xupa_emec/version.rb
    CHANGED
    
    
    
        data/xupa_emec.gemspec
    CHANGED
    
    
    
        metadata
    CHANGED
    
    | @@ -1,13 +1,13 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification 
         | 
| 2 2 | 
             
            name: xupa_emec
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version 
         | 
| 4 | 
            -
              hash:  | 
| 4 | 
            +
              hash: 17
         | 
| 5 5 | 
             
              prerelease: false
         | 
| 6 6 | 
             
              segments: 
         | 
| 7 7 | 
             
              - 1
         | 
| 8 8 | 
             
              - 0
         | 
| 9 | 
            -
              - 2
         | 
| 10 | 
            -
              version: 1.0.2
         | 
| 9 | 
            +
              - 3
         | 
| 10 | 
            +
              version: 1.0.3
         | 
| 11 11 | 
             
            platform: ruby
         | 
| 12 12 | 
             
            authors: 
         | 
| 13 13 | 
             
            - "Bernardo de P\xC3\xA1dua"
         |