coolCrawler 0.4.0 → 0.4.3
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
 - data/lib/coolCrawler/version.rb +1 -1
 - data/lib/cool_crawler.rb +24 -6
 - metadata +2 -2
 
    
        checksums.yaml
    CHANGED
    
    | 
         @@ -1,7 +1,7 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            ---
         
     | 
| 
       2 
2 
     | 
    
         
             
            SHA256:
         
     | 
| 
       3 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       4 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 3 
     | 
    
         
            +
              metadata.gz: a8c2944180ee7c5d7f1241fd62bf5e89973a2fca742c51567e1550f14f95f70a
         
     | 
| 
      
 4 
     | 
    
         
            +
              data.tar.gz: 6635f39af2babaead932e94cae419738d7bd61e0ecc2f23216cb6102607904bf
         
     | 
| 
       5 
5 
     | 
    
         
             
            SHA512:
         
     | 
| 
       6 
     | 
    
         
            -
              metadata.gz:  
     | 
| 
       7 
     | 
    
         
            -
              data.tar.gz:  
     | 
| 
      
 6 
     | 
    
         
            +
              metadata.gz: 4f56da51ab47060e7d58b3ac469ca15501b2b457a936b634b361afa0f832a27130c74169d7c1da4f3a188f56db58335c59325de261990ee02bbdbd1431ad24da
         
     | 
| 
      
 7 
     | 
    
         
            +
              data.tar.gz: 77dc4e5cdb5d0098ba8f91279bc4d658c653130313fbb49de52c3e2c2b707c4501fcf77a31f2a6a54b0b48ca12e54ab674423ab377e22e043a6e6cce0093a6df
         
     | 
    
        data/lib/coolCrawler/version.rb
    CHANGED
    
    
    
        data/lib/cool_crawler.rb
    CHANGED
    
    | 
         @@ -13,11 +13,13 @@ module CoolCrawler 
     | 
|
| 
       13 
13 
     | 
    
         
             
              # This is the class that handles the queue and async requests
         
     | 
| 
       14 
14 
     | 
    
         
             
              class CrawlerPool
         
     | 
| 
       15 
15 
     | 
    
         | 
| 
       16 
     | 
    
         
            -
                def initialize(start, max_connections, delay)
         
     | 
| 
      
 16 
     | 
    
         
            +
                def initialize(start, max_connections, delay, max_pages=50)
         
     | 
| 
       17 
17 
     | 
    
         
             
                  @uri = URI(start)
         
     | 
| 
      
 18 
     | 
    
         
            +
                  @max_pages = max_pages
         
     | 
| 
       18 
19 
     | 
    
         
             
                  @site = "#{uri.scheme}://#{uri.host}"
         
     | 
| 
       19 
20 
     | 
    
         
             
                  @max_connections = max_connections
         
     | 
| 
       20 
21 
     | 
    
         
             
                  @delay = delay
         
     | 
| 
      
 22 
     | 
    
         
            +
                  @visited_pages = 0
         
     | 
| 
       21 
23 
     | 
    
         
             
                  visited[uri.path] = 1
         
     | 
| 
       22 
24 
     | 
    
         
             
                  queue << uri.path
         
     | 
| 
       23 
25 
     | 
    
         
             
                end
         
     | 
| 
         @@ -49,9 +51,12 @@ module CoolCrawler 
     | 
|
| 
       49 
51 
     | 
    
         
             
                    pages.each do |page|
         
     | 
| 
       50 
52 
     | 
    
         
             
                      barrier.async do
         
     | 
| 
       51 
53 
     | 
    
         
             
                        response = internet.get URI.join(@site, page).to_s
         
     | 
| 
       52 
     | 
    
         
            -
                        body = response.read
         
     | 
| 
       53 
     | 
    
         
            -
                         
     | 
| 
       54 
     | 
    
         
            -
                         
     | 
| 
      
 54 
     | 
    
         
            +
                        body = Nokogiri::HTML(response.read)
         
     | 
| 
      
 55 
     | 
    
         
            +
                        body.search('//img').remove
         
     | 
| 
      
 56 
     | 
    
         
            +
                        body.search('//style').remove
         
     | 
| 
      
 57 
     | 
    
         
            +
                        body.search('//script').remove
         
     | 
| 
      
 58 
     | 
    
         
            +
                        links = gather_links_uri(body.to_s, URI.join(uri, page))
         
     | 
| 
      
 59 
     | 
    
         
            +
                        after(page, links, body.to_s)
         
     | 
| 
       55 
60 
     | 
    
         
             
                        links.each do |link|
         
     | 
| 
       56 
61 
     | 
    
         
             
                          enqueue(link)
         
     | 
| 
       57 
62 
     | 
    
         
             
                          add_to_visited(link)
         
     | 
| 
         @@ -71,7 +76,9 @@ module CoolCrawler 
     | 
|
| 
       71 
76 
     | 
    
         
             
                    next if a["href"].nil?
         
     | 
| 
       72 
77 
     | 
    
         
             
                    uri_a = URI(a["href"].strip.split('#')[0].sub(/\\|(\s+$)/, ""))
         
     | 
| 
       73 
78 
     | 
    
         
             
                    begin
         
     | 
| 
       74 
     | 
    
         
            -
                       
     | 
| 
      
 79 
     | 
    
         
            +
                      if @visited_pages <= @max_pages
         
     | 
| 
      
 80 
     | 
    
         
            +
                        links << URI.join(page, uri_a).path if (uri_a.host == uri.host || uri_a.host.nil?) && uri_a.path
         
     | 
| 
      
 81 
     | 
    
         
            +
                      end
         
     | 
| 
       75 
82 
     | 
    
         
             
                    rescue
         
     | 
| 
       76 
83 
     | 
    
         
             
                      # do nothing
         
     | 
| 
       77 
84 
     | 
    
         
             
                    end
         
     | 
| 
         @@ -97,6 +104,7 @@ module CoolCrawler 
     | 
|
| 
       97 
104 
     | 
    
         
             
                  else
         
     | 
| 
       98 
105 
     | 
    
         
             
                    visited[path] = 1
         
     | 
| 
       99 
106 
     | 
    
         
             
                  end
         
     | 
| 
      
 107 
     | 
    
         
            +
                  @visited_pages += 1
         
     | 
| 
       100 
108 
     | 
    
         
             
                end
         
     | 
| 
       101 
109 
     | 
    
         | 
| 
       102 
110 
     | 
    
         
             
                def sorted_visited
         
     | 
| 
         @@ -104,7 +112,17 @@ module CoolCrawler 
     | 
|
| 
       104 
112 
     | 
    
         
             
                end
         
     | 
| 
       105 
113 
     | 
    
         | 
| 
       106 
114 
     | 
    
         
             
                def enqueue(path)
         
     | 
| 
       107 
     | 
    
         
            -
                   
     | 
| 
      
 115 
     | 
    
         
            +
                  unless visited.include?(path)
         
     | 
| 
      
 116 
     | 
    
         
            +
                    queue << path
         
     | 
| 
      
 117 
     | 
    
         
            +
                  end
         
     | 
| 
      
 118 
     | 
    
         
            +
                end
         
     | 
| 
      
 119 
     | 
    
         
            +
             
     | 
| 
      
 120 
     | 
    
         
            +
                def sum_pages
         
     | 
| 
      
 121 
     | 
    
         
            +
                  sum = 0
         
     | 
| 
      
 122 
     | 
    
         
            +
                  visited.each do |_k, v|
         
     | 
| 
      
 123 
     | 
    
         
            +
                    sum += v
         
     | 
| 
      
 124 
     | 
    
         
            +
                  end
         
     | 
| 
      
 125 
     | 
    
         
            +
                  sum
         
     | 
| 
       108 
126 
     | 
    
         
             
                end
         
     | 
| 
       109 
127 
     | 
    
         
             
              end
         
     | 
| 
       110 
128 
     | 
    
         
             
            end
         
     | 
    
        metadata
    CHANGED
    
    | 
         @@ -1,14 +1,14 @@ 
     | 
|
| 
       1 
1 
     | 
    
         
             
            --- !ruby/object:Gem::Specification
         
     | 
| 
       2 
2 
     | 
    
         
             
            name: coolCrawler
         
     | 
| 
       3 
3 
     | 
    
         
             
            version: !ruby/object:Gem::Version
         
     | 
| 
       4 
     | 
    
         
            -
              version: 0.4.0
     | 
| 
      
 4 
     | 
    
         
            +
              version: 0.4.3
         
     | 
| 
       5 
5 
     | 
    
         
             
            platform: ruby
         
     | 
| 
       6 
6 
     | 
    
         
             
            authors:
         
     | 
| 
       7 
7 
     | 
    
         
             
            - William Wright
         
     | 
| 
       8 
8 
     | 
    
         
             
            autorequire:
         
     | 
| 
       9 
9 
     | 
    
         
             
            bindir: exe
         
     | 
| 
       10 
10 
     | 
    
         
             
            cert_chain: []
         
     | 
| 
       11 
     | 
    
         
            -
            date: 2022- 
     | 
| 
      
 11 
     | 
    
         
            +
            date: 2022-11-01 00:00:00.000000000 Z
         
     | 
| 
       12 
12 
     | 
    
         
             
            dependencies:
         
     | 
| 
       13 
13 
     | 
    
         
             
            - !ruby/object:Gem::Dependency
         
     | 
| 
       14 
14 
     | 
    
         
             
              name: rspec
         
     |