krawler 1.0.1 → 1.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. data/lib/krawler/version.rb +1 -1
  2. data/lib/krawler.rb +22 -16
  3. metadata +4 -4
data/lib/krawler/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = '1.0.1'
2
+ VERSION = '1.0.2'
3
3
  end
data/lib/krawler.rb CHANGED
@@ -3,29 +3,32 @@ require 'mechanize'
3
3
  require 'timeout'
4
4
  require 'uri'
5
5
  require 'thread'
6
+ require 'pry'
6
7
 
7
8
  module Krawler
8
9
 
9
10
  class Base
10
11
 
11
12
  def initialize(url, options)
12
- @url = URI(url)
13
- @host = "#{@url.scheme}://#{@url.host}"
14
- @base_path = @url.path
15
- @links_to_crawl = [@url.to_s]
16
- @crawled_links = []
17
- @bad_links = []
18
- @suspect_links = []
19
- @exclude = options[:exclude]
20
- @restrict = options[:restrict]
21
- @randomize = options[:randomize]
22
- @threads = options[:threads] || 1
23
- @mutex = Mutex.new
24
- @agent = Mechanize.new
13
+ @url = URI(url)
14
+ @host = "#{@url.scheme}://#{@url.host}"
15
+ @base_path = @url.path
16
+ @links_to_crawl = [@url.to_s]
17
+ @crawled_links = []
18
+ @bad_links = []
19
+ @suspect_links = []
20
+ @exclude = options[:exclude]
21
+ @restrict = options[:restrict]
22
+ @randomize = options[:randomize]
23
+ @threads = options[:threads] || 1
24
+ @mutex = Mutex.new
25
+ @agent = Mechanize.new
26
+ @agent.user_agent = 'Krawler'
27
+ @headers = { 'Accept-Encoding' => 'gzip, deflate' }
25
28
  end
26
29
 
27
30
  def base
28
- puts "Crawling..."
31
+ puts "Krawling..."
29
32
 
30
33
  crawl_page(@url, @agent)
31
34
  initialize_threads(@agent)
@@ -68,7 +71,7 @@ module Krawler
68
71
 
69
72
  begin
70
73
  start = Time.now
71
- page = agent.get(link)
74
+ page = agent.get(link, [], nil, @headers)
72
75
  rescue Mechanize::ResponseCodeError => e
73
76
  @mutex.synchronize { puts e }
74
77
  @bad_links << link
@@ -78,8 +81,11 @@ module Krawler
78
81
  return
79
82
  ensure
80
83
  @mutex.synchronize do
84
+ real = Time.now - start
85
+ runtime = page.header['x-runtime'].to_f
86
+ network = (real - runtime).round(10)
81
87
  puts link
82
- puts " [#{Time.now - start}s] #{@links_to_crawl.size} links..."
88
+ puts " [#{real}s real] [#{runtime}s runtime] [#{network}s network] #{@links_to_crawl.size} links..."
83
89
  end
84
90
  end
85
91
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-28 00:00:00.000000000 Z
12
+ date: 2012-07-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70152297970740 !ruby/object:Gem::Requirement
16
+ requirement: &70223227850320 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: 2.5.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70152297970740
24
+ version_requirements: *70223227850320
25
25
  description: Simple little website crawler.
26
26
  email:
27
27
  - mike@urlgonomics.com