indeedcrawler 0.0.2 → 0.0.3
This diff shows the changes between publicly released versions of this package as they appear in the supported public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/lib/indeedcrawler.rb +33 -5
- metadata +2 -2
 
    
checksums.yaml CHANGED

```diff
@@ -1,7 +1,7 @@
 ---
 SHA1:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 8d72ad14e0778839a3c4240834a4c10ebb94111e
+  data.tar.gz: e5a4bc28084ad26a63d0814c868eb18a6ba4565d
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 9d27cfc99eae6badb5643454a3b4b45192720d9e9df3530dbfe4bc2b766b84fb1dd414082a66c2d2cb777c634e6de542f19e9e69551a099f9ce95e182372b5b1
+  data.tar.gz: a4af2a1ed5dad981de77f48fccaa9025f2ceae7f3d59ad487f4698f89c3b10d523890b0552f3a9e86bf8942dc7526341938d3863da066330893a30480209071f
```
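These are the SHA1 and SHA512 digests of the two archives packed inside the published .gem file (metadata.gz and data.tar.gz); the 0.0.2 side is blank in this view. A minimal verification sketch, assuming the gem has already been fetched and unpacked as noted in the comment (file names are taken from the checksums file itself):

```ruby
require 'digest'

# Assumes indeedcrawler-0.0.3.gem has already been fetched and unpacked, e.g.:
#   gem fetch indeedcrawler -v 0.0.3 && tar -xf indeedcrawler-0.0.3.gem
%w[metadata.gz data.tar.gz].each do |archive|
  puts "#{archive} SHA1:   #{Digest::SHA1.file(archive).hexdigest}"
  puts "#{archive} SHA512: #{Digest::SHA512.file(archive).hexdigest}"
end
```

The printed digests should match the `+` lines in the diff above.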
    
data/lib/indeedcrawler.rb CHANGED

```diff
@@ -3,9 +3,10 @@ require 'uri'
 require 'requestmanager'
 require 'nokogiri'
 require 'indeedparser'
+require 'curb'
 
 class IndeedCrawler
-  def initialize(search_query, location, proxy_list, wait_time, browser_num)
+  def initialize(search_query, location, proxy_list, wait_time, browser_num, cm_hash)
     # Info for query
     @search_query = search_query
     @location = location
@@ -16,6 +17,10 @@ class IndeedCrawler
     # Result tracking
     @all_resume_links = Array.new
     @output = Array.new
+
+    # Handle crawler manager info
+    @cm_url = cm_hash[:crawler_manager_url] if cm_hash
+    @selector_id = cm_hash[:selector_id] if cm_hash
   end
 
   # Append query
@@ -70,15 +75,38 @@ class IndeedCrawler
         # Parse resume and add to results
         i = IndeedParser.new(resume, link, {time_scraped: Time.now})
         results = JSON.parse(i.get_results_by_job)
-
-        results.each do |result|
-          @output.push(result)
-        end
+        report_results(results, link)
       rescue
+
       end
     end
   end
 
+  # Figure out how to report results
+  def report_results(results, link)
+    if @cm_url
+      report_incremental(results, link)
+    else
+      report_batch(results)
+    end
+  end
+
+  # Report all results in one JSON
+  def report_batch(results)
+    results.each do |result|
+      @output.push(result)
+    end
+  end
+
+  # Report results back to Harvester incrementally
+  def report_incremental(results, link)
+    curl_url = @cm_url+"/relay_results"
+    c = Curl::Easy.http_post(curl_url,
+                             Curl::PostField.content('selector_id', @selector_id),
+                             Curl::PostField.content('status_message', "Collected " + link),
+                             Curl::PostField.content('results', JSON.pretty_generate(results)))
+  end
+
   # Get all the profile links
   def collect_it_all
     # Generate URL
```
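The functional change is the new sixth constructor argument, cm_hash: when it supplies :crawler_manager_url and :selector_id, the results for each parsed resume are POSTed via curb to <crawler_manager_url>/relay_results as they are collected; without it, results accumulate in @output exactly as in 0.0.2. A minimal usage sketch, assuming the remaining positional arguments behave as their names suggest (all concrete values below are placeholders):

```ruby
require 'indeedcrawler'

# Batch mode (0.0.2 behaviour): results collect in memory.
# cm_hash is now a required positional argument, so pass nil to opt out.
batch = IndeedCrawler.new('sysadmin', 'Boston, MA', nil, 10, 1, nil)

# Incremental mode: each resume's results are POSTed to
# <crawler_manager_url>/relay_results with selector_id and a status message.
cm_hash = {
  crawler_manager_url: 'http://localhost:3000',  # placeholder crawler manager URL
  selector_id: 'indeed-resumes-1'                # placeholder selector id
}
incremental = IndeedCrawler.new('sysadmin', 'Boston, MA', nil, 10, 1, cm_hash)
incremental.collect_it_all
```

Note that report_results is called inside the existing bare rescue, so a failed POST in report_incremental is swallowed silently along with any parse error.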
    
metadata CHANGED

```diff
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: indeedcrawler
 version: !ruby/object:Gem::Version
-  version: 0.0.2
+  version: 0.0.3
 platform: ruby
 authors:
 - M. C. McGrath
 autorequire: 
 bindir: bin
 cert_chain: []
-date: 
+date: 2016-10-05 00:00:00.000000000 Z
 dependencies: []
 description: Crawls Indeed resumes
 email: shidash@transparencytoolkit.org
```
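dependencies: [] is unchanged even though lib/indeedcrawler.rb now also requires curb alongside requestmanager, nokogiri, and indeedparser, so installing the gem does not pull any of them in. A hedged Gemfile sketch for a consumer (gem names are taken from the require lines above; version constraints are left open):

```ruby
# Gemfile
source 'https://rubygems.org'

gem 'indeedcrawler', '0.0.3'

# Not declared in the gemspec, but required at load time by lib/indeedcrawler.rb
gem 'curb'
gem 'requestmanager'
gem 'nokogiri'
gem 'indeedparser'
```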