tjcrawler 0.0.1 → 0.0.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/lib/tjcrawler/crawler.rb +2 -1
- data/lib/tjcrawler/parser.rb +6 -5
- data/lib/tjcrawler/version.rb +1 -1
- metadata +1 -1
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 4b2e68041b0126fe587a4348cb7b5586231d025b
|
4
|
+
data.tar.gz: 827c4d9d45cdb1c4fd33cabacdc5c10af36def81
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 0be2fdd12d3a5079e6979d30f200135d1eff5c193541db58e71b8a289e2d334d9b49eaad3070d0a3df4c38d20a5049d9c7ea99b062ed6d0be7d492732e5aa3c6
|
7
|
+
data.tar.gz: f9d0fa93536ebac50b13dad8988770b7bb1d97b12170d29c8ed53461c33d35aa5dde4753a4b58f379a7e44b735fc9773e482cf235e484984b6c75511985b5a6c
|
data/lib/tjcrawler/crawler.rb
CHANGED
@@ -22,7 +22,8 @@ module Tjcrawler
|
|
22
22
|
sleep 1 until page = Page.dequeue
|
23
23
|
print :'.'
|
24
24
|
result = crawl page.url
|
25
|
-
page.
|
25
|
+
page.update(content: result.content)
|
26
|
+
page.touch(:crawled_at)
|
26
27
|
result.links.each{ |url| Page.enqueue url }
|
27
28
|
end
|
28
29
|
end
|
data/lib/tjcrawler/parser.rb
CHANGED
@@ -1,10 +1,9 @@
|
|
1
1
|
require 'nokogiri'
|
2
2
|
require 'tjcrawler/page'
|
3
|
-
|
4
3
|
module Tjcrawler
|
5
4
|
class Parser
|
6
|
-
|
7
|
-
#
|
5
|
+
@@semaphore = Mutex.new
|
6
|
+
# a Nokogiri doc will be yielded to the block
|
8
7
|
def initialize &block
|
9
8
|
yield 'Block required' unless block_given?
|
10
9
|
@strategy = block
|
@@ -12,7 +11,8 @@ module Tjcrawler
|
|
12
11
|
|
13
12
|
def parse content
|
14
13
|
doc = Nokogiri::HTML(content)
|
15
|
-
ret =
|
14
|
+
ret = nil
|
15
|
+
@@semaphore.synchronize{ ret = @strategy[doc] }
|
16
16
|
print :'.'
|
17
17
|
ret
|
18
18
|
end
|
@@ -20,7 +20,8 @@ module Tjcrawler
|
|
20
20
|
def start
|
21
21
|
loop do
|
22
22
|
sleep 1 until page = find_next
|
23
|
-
|
23
|
+
parse(page.content)
|
24
|
+
page.touch(:parsed_at)
|
24
25
|
end
|
25
26
|
end
|
26
27
|
|
data/lib/tjcrawler/version.rb
CHANGED