arb-bs 0.1.6 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: cc17e9bd17a4771edc4696a6e8a052da7edd1484
4
- data.tar.gz: 8038b6face47fd2f01172dbb0577d0e95c551dbf
3
+ metadata.gz: d3fee1fcde122cff7050d84adeee0089b80a7b59
4
+ data.tar.gz: f54a5e8778a5718c7c88fb9b4f336bbf0ec1f423
5
5
  SHA512:
6
- metadata.gz: 57b740f6b723bef1202f1180e6209bcc5093f682539b1d053638c5d11e4180f115142d3e49681adb0707597a75abcc3c137117ee98dedced0498eba19793e1bd
7
- data.tar.gz: 20550beb04745265ad363d5f75c66a25adcfec0b2de4c1625302187fe8b8e0fe466a1a890a1ad535b840d9cdcf048cdd0e5031ba65e418ebf1350a2e85964ae9
6
+ metadata.gz: a915ebb6fe64cf5579683cf1b32082cae589758534b423b9cfce73ef9c70c18f715d442f62283655497dfa9b2c404c88bf57269fdac44c9dd8c8ad0957ad434e
7
+ data.tar.gz: 36f3da886a3eb1cff62adbd87aee7def07da9e37a8b62c8f3005ba6023a4444dc0d92ed3b049e4bd442108ed4f149e09051356e9fce551f11473c011cb24bbac
data/arb-bs.gemspec CHANGED
@@ -23,5 +23,6 @@ Gem::Specification.new do |spec|
23
23
  spec.add_development_dependency "bundler", "~> 1.14"
24
24
  spec.add_development_dependency "rake", "~> 10.0"
25
25
  spec.add_dependency "arb-crawler"
26
+ spec.add_dependency "arb-thread"
26
27
 
27
28
  end
data/exe/bs_pic CHANGED
@@ -1,5 +1,6 @@
1
1
  #!/usr/bin/env ruby
2
2
 
3
+ require 'arb/thread'
3
4
  require 'arb/crawler'
4
5
 
5
6
  include Arb
@@ -9,31 +10,37 @@ map_file='map.txt'
9
10
  max_page=(ARGV[0] || 50).to_i
10
11
  #Minimum idle time (in seconds) between two complete rounds.
11
12
  min_idle_time=(ARGV[1] || 600).to_i
13
+ thread_count=(ARGV[2] || 3).to_i
12
14
 
13
15
 
14
16
  File.open(map_file,'w+') unless File.exists? map_file
15
17
 
16
- loop do
17
- "http://#{domain}/pic/?".enum('?',1..max_page).each_with_index do |url,index|
18
- res=Crawler.get_by_css(url,"div.j-r-list-c-img a img")
19
- unless res
20
- puts "Some errors occur when parsing page #{index+1}."
21
- next
22
- end
23
- res.each do |hash|
24
- url_file=Crawler.filename_of_url(hash[:"data-original"])
25
- unless File.readlines(map_file).find{|line| line.to_s.include? url_file}
26
- if Crawler.download(hash[:"data-original"],url_file)
27
- puts "#{hash[:'data-original']}\n#{hash[:title]}",''
28
- File.open map_file,'a' do |file|
29
- file.puts "#{url_file}:#{Crawler.filter_str(hash[:title])}"
18
+ Thread.parallel(thread_count) do |dispatcher|
19
+
20
+ loop do
21
+ "http://#{domain}/pic/?".enum('?',1..max_page).each_with_index do |url,index|
22
+ dispatcher.new_task do
23
+ res=Crawler.get_by_css(url,"div.j-r-list-c-img a img")
24
+ unless res
25
+ puts "Some errors occur when parsing page #{index+1}."
26
+ next
27
+ end
28
+ res.each do |hash|
29
+ url_file=Crawler.filename_of_url(hash[:"data-original"])
30
+ unless File.readlines(map_file).find{|line| line.to_s.include? url_file}
31
+ if Crawler.download(hash[:"data-original"],url_file)
32
+ puts "#{hash[:'data-original']}\n#{hash[:title]}",''
33
+ File.open map_file,'a' do |file|
34
+ file.puts "#{url_file}:#{Crawler.filter_str(hash[:title])}"
35
+ end
36
+ end
30
37
  end
31
38
  end
39
+ tmp=1+rand(5)
40
+ puts "Page round finished for page #{index+1}, next action in #{tmp} seconds later."
41
+ sleep tmp
32
42
  end
33
43
  end
34
- tmp=1+rand(5)
35
- puts "Page round finished for page #{index+1}, next action in #{tmp} seconds later."
36
- sleep tmp
37
44
  end
38
45
  tmp=min_idle_time+rand(5)
39
46
  puts "Complete round finished, next action in #{tmp} seconds later."
@@ -1,5 +1,5 @@
1
1
  module Arb
2
2
  module Bs
3
- VERSION = "0.1.6"
3
+ VERSION = "1.0.0"
4
4
  end
5
5
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: arb-bs
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.6
4
+ version: 1.0.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - arybin
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2017-02-18 00:00:00.000000000 Z
11
+ date: 2018-04-05 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: bundler
@@ -52,6 +52,20 @@ dependencies:
52
52
  - - ">="
53
53
  - !ruby/object:Gem::Version
54
54
  version: '0'
55
+ - !ruby/object:Gem::Dependency
56
+ name: arb-thread
57
+ requirement: !ruby/object:Gem::Requirement
58
+ requirements:
59
+ - - ">="
60
+ - !ruby/object:Gem::Version
61
+ version: '0'
62
+ type: :runtime
63
+ prerelease: false
64
+ version_requirements: !ruby/object:Gem::Requirement
65
+ requirements:
66
+ - - ">="
67
+ - !ruby/object:Gem::Version
68
+ version: '0'
55
69
  description: A demo of Web Crawler using arb-crawler
56
70
  email:
57
71
  - arybin@163.com
@@ -89,7 +103,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
89
103
  version: '0'
90
104
  requirements: []
91
105
  rubyforge_project:
92
- rubygems_version: 2.4.8
106
+ rubygems_version: 2.6.14
93
107
  signing_key:
94
108
  specification_version: 4
95
109
  summary: A demo of Web Crawler using arb-crawler