krawler 1.0.1 → 1.0.2

Sign up to get free protection for your applications and to get access to all the features.
Files changed (3)
  1. data/lib/krawler/version.rb +1 -1
  2. data/lib/krawler.rb +22 -16
  3. metadata +4 -4
data/lib/krawler/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Krawler
2
- VERSION = '1.0.1'
2
+ VERSION = '1.0.2'
3
3
  end
data/lib/krawler.rb CHANGED
@@ -3,29 +3,32 @@ require 'mechanize'
3
3
  require 'timeout'
4
4
  require 'uri'
5
5
  require 'thread'
6
+ require 'pry'
6
7
 
7
8
  module Krawler
8
9
 
9
10
  class Base
10
11
 
11
12
  def initialize(url, options)
12
- @url = URI(url)
13
- @host = "#{@url.scheme}://#{@url.host}"
14
- @base_path = @url.path
15
- @links_to_crawl = [@url.to_s]
16
- @crawled_links = []
17
- @bad_links = []
18
- @suspect_links = []
19
- @exclude = options[:exclude]
20
- @restrict = options[:restrict]
21
- @randomize = options[:randomize]
22
- @threads = options[:threads] || 1
23
- @mutex = Mutex.new
24
- @agent = Mechanize.new
13
+ @url = URI(url)
14
+ @host = "#{@url.scheme}://#{@url.host}"
15
+ @base_path = @url.path
16
+ @links_to_crawl = [@url.to_s]
17
+ @crawled_links = []
18
+ @bad_links = []
19
+ @suspect_links = []
20
+ @exclude = options[:exclude]
21
+ @restrict = options[:restrict]
22
+ @randomize = options[:randomize]
23
+ @threads = options[:threads] || 1
24
+ @mutex = Mutex.new
25
+ @agent = Mechanize.new
26
+ @agent.user_agent = 'Krawler'
27
+ @headers = { 'Accept-Encoding' => 'gzip, deflate' }
25
28
  end
26
29
 
27
30
  def base
28
- puts "Crawling..."
31
+ puts "Krawling..."
29
32
 
30
33
  crawl_page(@url, @agent)
31
34
  initialize_threads(@agent)
@@ -68,7 +71,7 @@ module Krawler
68
71
 
69
72
  begin
70
73
  start = Time.now
71
- page = agent.get(link)
74
+ page = agent.get(link, [], nil, @headers)
72
75
  rescue Mechanize::ResponseCodeError => e
73
76
  @mutex.synchronize { puts e }
74
77
  @bad_links << link
@@ -78,8 +81,11 @@ module Krawler
78
81
  return
79
82
  ensure
80
83
  @mutex.synchronize do
84
+ real = Time.now - start
85
+ runtime = page.header['x-runtime'].to_f
86
+ network = (real - runtime).round(10)
81
87
  puts link
82
- puts " [#{Time.now - start}s] #{@links_to_crawl.size} links..."
88
+ puts " [#{real}s real] [#{runtime}s runtime] [#{network}s network] #{@links_to_crawl.size} links..."
83
89
  end
84
90
  end
85
91
 
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: krawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 1.0.1
4
+ version: 1.0.2
5
5
  prerelease:
6
6
  platform: ruby
7
7
  authors:
@@ -9,11 +9,11 @@ authors:
9
9
  autorequire:
10
10
  bindir: bin
11
11
  cert_chain: []
12
- date: 2012-06-28 00:00:00.000000000 Z
12
+ date: 2012-07-05 00:00:00.000000000 Z
13
13
  dependencies:
14
14
  - !ruby/object:Gem::Dependency
15
15
  name: mechanize
16
- requirement: &70152297970740 !ruby/object:Gem::Requirement
16
+ requirement: &70223227850320 !ruby/object:Gem::Requirement
17
17
  none: false
18
18
  requirements:
19
19
  - - ~>
@@ -21,7 +21,7 @@ dependencies:
21
21
  version: 2.5.0
22
22
  type: :runtime
23
23
  prerelease: false
24
- version_requirements: *70152297970740
24
+ version_requirements: *70223227850320
25
25
  description: Simple little website crawler.
26
26
  email:
27
27
  - mike@urlgonomics.com