generalscraper 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/lib/generalscraper.rb +78 -0
- metadata +45 -0
    
        checksums.yaml
    ADDED
    
    | @@ -0,0 +1,7 @@ | |
| 1 | 
            +
            ---
         | 
| 2 | 
            +
            SHA1:
         | 
| 3 | 
            +
              metadata.gz: 1e568d1e9d1c0fa9c98f814128439bd009194c0d
         | 
| 4 | 
            +
              data.tar.gz: f26d58116e50b61ad308eed4df0ce03e76cbfba8
         | 
| 5 | 
            +
            SHA512:
         | 
| 6 | 
            +
              metadata.gz: 92120914fbe80f8a0b5b3b5b996753b07a218745a88d7cd201fb533fa3de4e116499c1cc2d218d030771bf3d25dbf42743efc83ed7f1d2e4cb44939229ec31a6
         | 
| 7 | 
            +
              data.tar.gz: b7d4070620f7a43ff7008ae414ea3aed53da287c697f32cb80f6afbd66186d94151a7393ac9547428f9ddeabf37e3500e7e55165ad11f7d2e1f994975e539f96
         | 
| @@ -0,0 +1,78 @@ | |
| 1 | 
            +
            require 'mechanize'
         | 
| 2 | 
            +
            require 'json'
         | 
| 3 | 
            +
            require 'nokogiri'
         | 
| 4 | 
            +
            require 'open-uri'
         | 
| 5 | 
            +
             | 
| 6 | 
            +
            # Collects the pages of one site that a Google search lists for given terms.
            #
            # Runs a "site:<scrapesite> <input>" query, walks the result links and the
            # pagination links of every results page, downloads each result and records
            # its URL, retrieval time, title, named <meta> tags and body text.
            class GeneralScraper
              # @param scrapesite [String] site the search is restricted to
              # @param input [String] search terms
              def initialize(scrapesite, input)
                @input = input
                @scrapesite = scrapesite
                @output = []
                # Value the "&start=" parameter of the next pagination link must carry.
                @startindex = 10
              end

              # Searches for links on Google and examines the first results page.
              def search
                agent = new_agent
                gform = agent.get("http://google.com").form("f")
                gform.q = "site:#{@scrapesite} #{@input}"
                examine(agent.submit(gform, gform.buttons.first))
              end

              # Examines a search page: scrapes every result link that points at the
              # target site and follows the pagination link for the next offset.
              #
              # @param page [#links] page whose links respond to #href
              def examine(page)
                page.links.each do |link|
                  href = link.href
                  # Anchors without an href attribute have nothing to follow.
                  next unless href

                  result_url = extract_result_url(href)
                  getPage(result_url) if result_url

                  follow_next_page(href) if next_page_link?(href)
                end
              end

              # Scrape the page content and append it to the collected output.
              #
              # Failures (bad URL, network error, parse error) are reported on stdout
              # and the page is skipped, so one broken result does not stop the run.
              #
              # @param url [String] result URL, possibly with "?" and "=" still escaped
              def getPage(url)
                # Work on a copy so the caller's string is not modified.
                url = url.to_s.gsub("%3F", "?").gsub("%3D", "=")
                pagehash = {}
                pagehash[:url] = url
                pagehash[:date_retrieved] = Time.now
                # URI#open (from open-uri) only handles real URLs; Kernel#open would
                # execute a command for strings starting with "|". The block form
                # closes the downloaded stream once it has been parsed.
                html = URI.parse(url).open { |io| Nokogiri::HTML(io) }
                pagehash[:title] = html.css("title").text
                html.css("meta").each do |meta|
                  name = meta['name']
                  # Tags without a name (charset, http-equiv, ...) would all collide
                  # on a nil key, so they are left out.
                  pagehash[name] = meta['content'] if name
                end
                pagehash[:page] = html.css("body").text
                @output.push(pagehash)
              rescue StandardError
                puts "URL: #{url}"
              end

              # Gets all data and returns in JSON
              #
              # @return [String] pretty-printed JSON array with one object per page
              def getData
                search
                JSON.pretty_generate(@output)
              end

              private

              # Builds a Mechanize agent with the user agent used for every request.
              def new_agent
                agent = Mechanize.new
                agent.user_agent_alias = 'Linux Firefox'
                agent
              end

              # Returns the target URL carried in the "?q=" parameter of a result
              # link, or nil when the link is not a result for the scraped site.
              def extract_result_url(href)
                return nil unless href.include?(@scrapesite)
                return nil if href.include?("webcache") || href.include?("site:" + @scrapesite)

                target = href.split("?q=")[1]
                target && target.split("&")[0]
              end

              # True when href is the pagination link for the next expected offset.
              def next_page_link?(href)
                return false unless href.include?("&sa=N") && href.include?("&start=")

                offset = href.split("&start=")[1].to_s.split("&sa=N")[0]
                offset.to_i == @startindex
              end

              # Waits a random 30-90 seconds, then fetches and examines the next page.
              def follow_next_page(href)
                sleep(rand(30..90))
                @startindex += 10
                examine(new_agent.get("http://google.com" + href))
              end
            end
         | 
| 78 | 
            +
             | 
    
        metadata
    ADDED
    
    | @@ -0,0 +1,45 @@ | |
| 1 | 
            +
            --- !ruby/object:Gem::Specification
         | 
| 2 | 
            +
            name: generalscraper
         | 
| 3 | 
            +
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            +
              version: 0.0.1
         | 
| 5 | 
            +
            platform: ruby
         | 
| 6 | 
            +
            authors:
         | 
| 7 | 
            +
            - M. C. McGrath
         | 
| 8 | 
            +
            autorequire: 
         | 
| 9 | 
            +
            bindir: bin
         | 
| 10 | 
            +
            cert_chain: []
         | 
| 11 | 
            +
            date: 2014-05-15 00:00:00.000000000 Z
         | 
| 12 | 
            +
            dependencies: []
         | 
| 13 | 
            +
            description: Scrapes all pages on a site you specify including terms you specify.
         | 
| 14 | 
            +
            email: shidash@shidash.com
         | 
| 15 | 
            +
            executables: []
         | 
| 16 | 
            +
            extensions: []
         | 
| 17 | 
            +
            extra_rdoc_files: []
         | 
| 18 | 
            +
            files:
         | 
| 19 | 
            +
            - lib/generalscraper.rb
         | 
| 20 | 
            +
            homepage: https://github.com/TransparencyToolkit/generalscraper
         | 
| 21 | 
            +
            licenses:
         | 
| 22 | 
            +
            - GPL
         | 
| 23 | 
            +
            metadata: {}
         | 
| 24 | 
            +
            post_install_message: 
         | 
| 25 | 
            +
            rdoc_options: []
         | 
| 26 | 
            +
            require_paths:
         | 
| 27 | 
            +
            - lib
         | 
| 28 | 
            +
            required_ruby_version: !ruby/object:Gem::Requirement
         | 
| 29 | 
            +
              requirements:
         | 
| 30 | 
            +
              - - '>='
         | 
| 31 | 
            +
                - !ruby/object:Gem::Version
         | 
| 32 | 
            +
                  version: '0'
         | 
| 33 | 
            +
            required_rubygems_version: !ruby/object:Gem::Requirement
         | 
| 34 | 
            +
              requirements:
         | 
| 35 | 
            +
              - - '>='
         | 
| 36 | 
            +
                - !ruby/object:Gem::Version
         | 
| 37 | 
            +
                  version: '0'
         | 
| 38 | 
            +
            requirements: []
         | 
| 39 | 
            +
            rubyforge_project: 
         | 
| 40 | 
            +
            rubygems_version: 2.0.14
         | 
| 41 | 
            +
            signing_key: 
         | 
| 42 | 
            +
            specification_version: 4
         | 
| 43 | 
            +
            summary: Get all pages on a site for terms specified
         | 
| 44 | 
            +
            test_files: []
         | 
| 45 | 
            +
            has_rdoc: 
         |