krawler 1.0.1 → 1.0.2
Sign up to get free protection for your applications and to get access to all the features.
- data/lib/krawler/version.rb +1 -1
- data/lib/krawler.rb +22 -16
- metadata +4 -4
data/lib/krawler/version.rb
CHANGED
data/lib/krawler.rb
CHANGED
@@ -3,29 +3,32 @@ require 'mechanize'
|
|
3
3
|
require 'timeout'
|
4
4
|
require 'uri'
|
5
5
|
require 'thread'
|
6
|
+
require 'pry'
|
6
7
|
|
7
8
|
module Krawler
|
8
9
|
|
9
10
|
class Base
|
10
11
|
|
11
12
|
def initialize(url, options)
|
12
|
-
@url
|
13
|
-
@host
|
14
|
-
@base_path
|
15
|
-
@links_to_crawl
|
16
|
-
@crawled_links
|
17
|
-
@bad_links
|
18
|
-
@suspect_links
|
19
|
-
@exclude
|
20
|
-
@restrict
|
21
|
-
@randomize
|
22
|
-
@threads
|
23
|
-
@mutex
|
24
|
-
@agent
|
13
|
+
@url = URI(url)
|
14
|
+
@host = "#{@url.scheme}://#{@url.host}"
|
15
|
+
@base_path = @url.path
|
16
|
+
@links_to_crawl = [@url.to_s]
|
17
|
+
@crawled_links = []
|
18
|
+
@bad_links = []
|
19
|
+
@suspect_links = []
|
20
|
+
@exclude = options[:exclude]
|
21
|
+
@restrict = options[:restrict]
|
22
|
+
@randomize = options[:randomize]
|
23
|
+
@threads = options[:threads] || 1
|
24
|
+
@mutex = Mutex.new
|
25
|
+
@agent = Mechanize.new
|
26
|
+
@agent.user_agent = 'Krawler'
|
27
|
+
@headers = { 'Accept-Encoding' => 'gzip, deflate' }
|
25
28
|
end
|
26
29
|
|
27
30
|
def base
|
28
|
-
puts "
|
31
|
+
puts "Krawling..."
|
29
32
|
|
30
33
|
crawl_page(@url, @agent)
|
31
34
|
initialize_threads(@agent)
|
@@ -68,7 +71,7 @@ module Krawler
|
|
68
71
|
|
69
72
|
begin
|
70
73
|
start = Time.now
|
71
|
-
page = agent.get(link)
|
74
|
+
page = agent.get(link, [], nil, @headers)
|
72
75
|
rescue Mechanize::ResponseCodeError => e
|
73
76
|
@mutex.synchronize { puts e }
|
74
77
|
@bad_links << link
|
@@ -78,8 +81,11 @@ module Krawler
|
|
78
81
|
return
|
79
82
|
ensure
|
80
83
|
@mutex.synchronize do
|
84
|
+
real = Time.now - start
|
85
|
+
runtime = page.header['x-runtime'].to_f
|
86
|
+
network = (real - runtime).round(10)
|
81
87
|
puts link
|
82
|
-
puts " [#{
|
88
|
+
puts " [#{real}s real] [#{runtime}s runtime] [#{network}s network] #{@links_to_crawl.size} links..."
|
83
89
|
end
|
84
90
|
end
|
85
91
|
|
metadata
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: krawler
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 1.0.1
|
4
|
+
version: 1.0.2
|
5
5
|
prerelease:
|
6
6
|
platform: ruby
|
7
7
|
authors:
|
@@ -9,11 +9,11 @@ authors:
|
|
9
9
|
autorequire:
|
10
10
|
bindir: bin
|
11
11
|
cert_chain: []
|
12
|
-
date: 2012-
|
12
|
+
date: 2012-07-05 00:00:00.000000000 Z
|
13
13
|
dependencies:
|
14
14
|
- !ruby/object:Gem::Dependency
|
15
15
|
name: mechanize
|
16
|
-
requirement: &
|
16
|
+
requirement: &70223227850320 !ruby/object:Gem::Requirement
|
17
17
|
none: false
|
18
18
|
requirements:
|
19
19
|
- - ~>
|
@@ -21,7 +21,7 @@ dependencies:
|
|
21
21
|
version: 2.5.0
|
22
22
|
type: :runtime
|
23
23
|
prerelease: false
|
24
|
-
version_requirements: *
|
24
|
+
version_requirements: *70223227850320
|
25
25
|
description: Simple little website crawler.
|
26
26
|
email:
|
27
27
|
- mike@urlgonomics.com
|