tjcrawler 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: d862225dcccd8804520d59ed284c9e536eaed0c8
4
- data.tar.gz: 5ce47b642a5bae952139f8ad53adb00d4f697e49
3
+ metadata.gz: 4b2e68041b0126fe587a4348cb7b5586231d025b
4
+ data.tar.gz: 827c4d9d45cdb1c4fd33cabacdc5c10af36def81
5
5
  SHA512:
6
- metadata.gz: b376fdb73d63fe698ddd1dbb787ab3e5973a58fdc1e5c4514090c0785e3cf594dbbb3398d8b26dd555f255e899a70b1c3406aa741fe1af7273dae1913f8265e3
7
- data.tar.gz: da0abd28dfb2245fd5b9a54d3e3094d4db1c8add13385eed00b13f15580d877a0ff22fd41e49c04e6c8361dead88f23edfd44d3c60e1592cad9bfb63f549c7ff
6
+ metadata.gz: 0be2fdd12d3a5079e6979d30f200135d1eff5c193541db58e71b8a289e2d334d9b49eaad3070d0a3df4c38d20a5049d9c7ea99b062ed6d0be7d492732e5aa3c6
7
+ data.tar.gz: f9d0fa93536ebac50b13dad8988770b7bb1d97b12170d29c8ed53461c33d35aa5dde4753a4b58f379a7e44b735fc9773e482cf235e484984b6c75511985b5a6c
@@ -22,7 +22,8 @@ module Tjcrawler
22
22
  sleep 1 until page = Page.dequeue
23
23
  print :'.'
24
24
  result = crawl page.url
25
- page.touch(:crawled_at) if page.update(content: result.content)
25
+ page.update(content: result.content)
26
+ page.touch(:crawled_at)
26
27
  result.links.each{ |url| Page.enqueue url }
27
28
  end
28
29
  end
@@ -1,10 +1,9 @@
1
1
  require 'nokogiri'
2
2
  require 'tjcrawler/page'
3
-
4
3
  module Tjcrawler
5
4
  class Parser
6
- # a nokogiri doc will be yield in block, return true/flase
7
- # for successful/failed parsing.
5
+ @@semaphore = Mutex.new
6
+ # a nokogiri doc will be yield in block
8
7
  def initialize &block
9
8
  yield 'Block required' unless block_given?
10
9
  @strategy = block
@@ -12,7 +11,8 @@ module Tjcrawler
12
11
 
13
12
  def parse content
14
13
  doc = Nokogiri::HTML(content)
15
- ret = @strategy[doc]
14
+ ret = nil
15
+ @@semaphore.synchronize{ ret = @strategy[doc] }
16
16
  print :'.'
17
17
  ret
18
18
  end
@@ -20,7 +20,8 @@ module Tjcrawler
20
20
  def start
21
21
  loop do
22
22
  sleep 1 until page = find_next
23
- page.touch(:parsed_at) if parse(page.content)
23
+ parse(page.content)
24
+ page.touch(:parsed_at)
24
25
  end
25
26
  end
26
27
 
@@ -1,3 +1,3 @@
1
1
  module Tjcrawler
2
- VERSION = "0.0.1"
2
+ VERSION = "0.0.2"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tjcrawler
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.0.1
4
+ version: 0.0.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Jian Weihang