coolCrawler 0.4.0 → 0.4.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/lib/coolCrawler/version.rb +1 -1
- data/lib/cool_crawler.rb +16 -2
- metadata +2 -2
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 035e8dbe27d4648d16243fea96c692b4a9bde8973d13df02b66aad3aa742afc3
         | 
| 4 | 
            +
              data.tar.gz: 687985a3d0e391aedc610ecae0329299a36c8edc90704b9f9d92118532d3d7cc
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: 224908b5e8f495063ddc81c178f10e276b2744c8d8b71b16538b3ac2eed70978e686cecc1cbf900d3380a1919ec5e438dd0a1dc559b7d6abbde30e9bd44f819e
         | 
| 7 | 
            +
              data.tar.gz: 0ee4aa741315bdd8a2774ee28b762a521f12489356621efe6cf38fab66898a2210313b7e8f07eb74ebd95492b20b80e6548d84a0333f10ac8c09ef6e96410b53
         | 
    
        data/lib/coolCrawler/version.rb
    CHANGED
    
    
    
        data/lib/cool_crawler.rb
    CHANGED
    
    | @@ -13,11 +13,13 @@ module CoolCrawler | |
| 13 13 | 
             
              # This is the class that handles the queue and async requests
         | 
| 14 14 | 
             
              class CrawlerPool
         | 
| 15 15 |  | 
| 16 | 
            -
                def initialize(start, max_connections, delay)
         | 
| 16 | 
            +
                def initialize(start, max_connections, delay, max_pages=50)
         | 
| 17 17 | 
             
                  @uri = URI(start)
         | 
| 18 | 
            +
                  @max_pages = max_pages
         | 
| 18 19 | 
             
                  @site = "#{uri.scheme}://#{uri.host}"
         | 
| 19 20 | 
             
                  @max_connections = max_connections
         | 
| 20 21 | 
             
                  @delay = delay
         | 
| 22 | 
            +
                  @visited_pages = 0
         | 
| 21 23 | 
             
                  visited[uri.path] = 1
         | 
| 22 24 | 
             
                  queue << uri.path
         | 
| 23 25 | 
             
                end
         | 
| @@ -54,6 +56,7 @@ module CoolCrawler | |
| 54 56 | 
             
                        after(page, links, body)
         | 
| 55 57 | 
             
                        links.each do |link|
         | 
| 56 58 | 
             
                          enqueue(link)
         | 
| 59 | 
            +
                          @visited_pages += 1
         | 
| 57 60 | 
             
                          add_to_visited(link)
         | 
| 58 61 | 
             
                        end
         | 
| 59 62 | 
             
                      end
         | 
| @@ -104,7 +107,18 @@ module CoolCrawler | |
| 104 107 | 
             
                end
         | 
| 105 108 |  | 
| 106 109 | 
             
                def enqueue(path)
         | 
| 107 | 
            -
                   | 
| 110 | 
            +
                  unless visited.include?(path) or @visited_pages > @max_pages
         | 
| 111 | 
            +
                    queue << path
         | 
| 112 | 
            +
                    p queue.size
         | 
| 113 | 
            +
                  end
         | 
| 114 | 
            +
                end
         | 
| 115 | 
            +
             | 
| 116 | 
            +
                def sum_pages
         | 
| 117 | 
            +
                  sum = 0
         | 
| 118 | 
            +
                  visited.each do |_k, v|
         | 
| 119 | 
            +
                    sum += v
         | 
| 120 | 
            +
                  end
         | 
| 121 | 
            +
                  sum
         | 
| 108 122 | 
             
                end
         | 
| 109 123 | 
             
              end
         | 
| 110 124 | 
             
            end
         | 
    
        metadata
    CHANGED
    
    | @@ -1,14 +1,14 @@ | |
| 1 1 | 
             
            --- !ruby/object:Gem::Specification
         | 
| 2 2 | 
             
            name: coolCrawler
         | 
| 3 3 | 
             
            version: !ruby/object:Gem::Version
         | 
| 4 | 
            -
              version: 0.4.0 | 
| 4 | 
            +
              version: 0.4.2
         | 
| 5 5 | 
             
            platform: ruby
         | 
| 6 6 | 
             
            authors:
         | 
| 7 7 | 
             
            - William Wright
         | 
| 8 8 | 
             
            autorequire:
         | 
| 9 9 | 
             
            bindir: exe
         | 
| 10 10 | 
             
            cert_chain: []
         | 
| 11 | 
            -
            date: 2022-10- | 
| 11 | 
            +
            date: 2022-10-31 00:00:00.000000000 Z
         | 
| 12 12 | 
             
            dependencies:
         | 
| 13 13 | 
             
            - !ruby/object:Gem::Dependency
         | 
| 14 14 | 
             
              name: rspec
         |