tjcrawler 0.0.1 → 0.0.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d862225dcccd8804520d59ed284c9e536eaed0c8
4
- data.tar.gz: 5ce47b642a5bae952139f8ad53adb00d4f697e49
3
+ metadata.gz: 4b2e68041b0126fe587a4348cb7b5586231d025b
4
+ data.tar.gz: 827c4d9d45cdb1c4fd33cabacdc5c10af36def81
5
5
  SHA512:
6
- metadata.gz: b376fdb73d63fe698ddd1dbb787ab3e5973a58fdc1e5c4514090c0785e3cf594dbbb3398d8b26dd555f255e899a70b1c3406aa741fe1af7273dae1913f8265e3
7
- data.tar.gz: da0abd28dfb2245fd5b9a54d3e3094d4db1c8add13385eed00b13f15580d877a0ff22fd41e49c04e6c8361dead88f23edfd44d3c60e1592cad9bfb63f549c7ff
6
+ metadata.gz: 0be2fdd12d3a5079e6979d30f200135d1eff5c193541db58e71b8a289e2d334d9b49eaad3070d0a3df4c38d20a5049d9c7ea99b062ed6d0be7d492732e5aa3c6
7
+ data.tar.gz: f9d0fa93536ebac50b13dad8988770b7bb1d97b12170d29c8ed53461c33d35aa5dde4753a4b58f379a7e44b735fc9773e482cf235e484984b6c75511985b5a6c
@@ -22,7 +22,8 @@ module Tjcrawler
22
22
  sleep 1 until page = Page.dequeue
23
23
  print :'.'
24
24
  result = crawl page.url
25
- page.touch(:crawled_at) if page.update(content: result.content)
25
+ page.update(content: result.content)
26
+ page.touch(:crawled_at)
26
27
  result.links.each{ |url| Page.enqueue url }
27
28
  end
28
29
  end
@@ -1,10 +1,9 @@
1
1
  require 'nokogiri'
2
2
  require 'tjcrawler/page'
3
-
4
3
  module Tjcrawler
5
4
  class Parser
6
- # a nokogiri doc will be yield in block, return true/flase
7
- # for successful/failed parsing.
5
+ @@semaphore = Mutex.new
6
+ # a nokogiri doc will be yield in block
8
7
  def initialize &block
9
8
  yield 'Block required' unless block_given?
10
9
  @strategy = block
@@ -12,7 +11,8 @@ module Tjcrawler
12
11
 
13
12
  def parse content
14
13
  doc = Nokogiri::HTML(content)
15
- ret = @strategy[doc]
14
+ ret = nil
15
+ @@semaphore.synchronize{ ret = @strategy[doc] }
16
16
  print :'.'
17
17
  ret
18
18
  end
@@ -20,7 +20,8 @@ module Tjcrawler
20
20
  def start
21
21
  loop do
22
22
  sleep 1 until page = find_next
23
- page.touch(:parsed_at) if parse(page.content)
23
+ parse(page.content)
24
+ page.touch(:parsed_at)
24
25
  end
25
26
  end
26
27
 
@@ -1,3 +1,3 @@
1
1
  module Tjcrawler
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tjcrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang