grucrawler 0.0.1 → 0.0.2

This diff shows the changes between two publicly available versions of a package that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 2c1969c40c532caad82628f59ac6cd088d069af2
4
- data.tar.gz: 27e7781412201b02ef9356b28f0119a2cef5d69d
3
+ metadata.gz: c5723aa43bafd68af0aa0499c9f748bfca57a580
4
+ data.tar.gz: 3e7046a91e6594c17e987bfb8d934402ee2826dc
5
5
  SHA512:
6
- metadata.gz: 2f8704b18e02eedd2efaa39649a2e61e60bc6cfcd17a8e5e77e28e6e793d4475f5e29d887441cb0f632cd785d03e34e0c661059f810460be190ade23da2a7925
7
- data.tar.gz: e10692a0971c5b427b501018fb85419d2810880225add5f2a7d7358abd4bd82b32f5ce5bc17781c582bdcde61d8429e040755d88d6cad9cacac8b7d2cc5f490c
6
+ metadata.gz: b17e8fdd06816a6e4a1294a9dbf384d95519d1f4046d2d1c4bbf2e4c5821b4839f26e4290e4d655431a1f23beb9fb918c729ae34c6d56c4a6e71e0daf67a6520
7
+ data.tar.gz: 1f253a2fe81b1ae537a0c4de4ed9626261f8f2b72567959839550b883cf41b0c952e164ca1b4ff7a8d1a8378cb7a76a6acfcee9573da48219164069659da67df
data/README.md CHANGED
@@ -2,13 +2,15 @@
2
2
 
3
3
  ```ruby
4
4
  require 'grucrawler'
5
+ require 'colorize'
5
6
 
6
7
  class ItalianCrawler
7
8
  def options
8
9
  {
9
10
  visit_urls_only_once: true,
10
11
  follow_redirects: true,
11
- concurrency: 5
12
+ concurrency: 5,
13
+ domain_wait: 20 # seconds between visits to the same domain
12
14
  }
13
15
  end
14
16
 
@@ -7,13 +7,14 @@ class GruCrawler
7
7
  DOMAIN_VISITS_KEY = 'domain_visits'
8
8
  QUEUE_KEY = 'queue'
9
9
 
10
- def initialize(namespace, visit_once)
10
+ def initialize(namespace, visit_once, domain_wait)
11
11
  @redis = Redis.new
12
12
  @rns = namespace + ':'
13
13
  @concurrent_requests = 0
14
14
  @tmp_block = {}
15
15
  @domains_throttle = Hash.new(0.0)
16
16
  @visit_once = visit_once
17
+ @domain_wait = domain_wait
17
18
  end
18
19
 
19
20
  def reset
@@ -41,15 +42,13 @@ class GruCrawler
41
42
  url
42
43
  end
43
44
 
44
- MIN_TIME_TO_WAIT = 20
45
-
46
45
  def can_visit_now(url)
47
46
  return false if @tmp_block[url]
48
47
 
49
48
  last_visit = last_visit_to_domain(url)
50
49
  time_passed = Time.now.to_f - last_visit
51
50
 
52
- time_passed > MIN_TIME_TO_WAIT
51
+ time_passed > @domain_wait
53
52
  end
54
53
 
55
54
  def started(url)
@@ -1,3 +1,3 @@
1
1
  class GruCrawler
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
data/lib/grucrawler.rb CHANGED
@@ -12,7 +12,8 @@ class GruCrawler
12
12
  def initialize(rules)
13
13
  @crawler = rules
14
14
  @options = @crawler.options()
15
- @queue = GruCrawler::Queue.new(@crawler.class.name, @options[:visit_urls_only_once])
15
+ domain_wait = @options[:domain_wait] || 20
16
+ @queue = GruCrawler::Queue.new(@crawler.class.name, @options[:visit_urls_only_once], domain_wait)
16
17
 
17
18
  @crawler.on_init(self)
18
19
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: grucrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Slava Vishnyakov