samao 0.1.0 → 0.1.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 561ab598e5442161a48b2878887296f8047aa269
4
- data.tar.gz: 76d3daa2ca3c1407286690404809ee2b58115764
3
+ metadata.gz: a483278b39eb8b7c7f68d9ccd02b9d8fa7bb5cec
4
+ data.tar.gz: 307181057f12f81fca2a408a85dfa9de6fdedeaf
5
5
  SHA512:
6
- metadata.gz: acab6baf781e25d75cb34d7214ac78cdb79936cca735fa554f152342fd285ed562e30c2ac5983a0c21cd6b3154dacd1830871637fd08be89dfbde109569591ca
7
- data.tar.gz: 5a2f6e9d1a9ee336fe2e4234ec8d6336f00748cb5d39a06932987a557ec342c7f223471700502a64d3b02296920e91f3c2484078b23432b189ec2b09fb529abc
6
+ metadata.gz: e4608bbab36feb41b82f5779a7c2be74c45349b349b2243cbe30ccf4666c83a95e1407b7439a36c9f4ce0fb3eabc5bd0d2a9cca5b0c6529bde85355ae84afea0
7
+ data.tar.gz: f181681e70dd14f9607f9a8002e2656522b6f4cb483aa3aa18153f6c2d5bc74619436997e3e1eac5a7d2e3558dfb66fae355bd2c3e73b80a6f39cb40ef8b38c7
data/README.md CHANGED
@@ -29,28 +29,32 @@ Or install it yourself as:
29
29
  samao = Samao::Detector.new
30
30
 
31
31
  # set base url and start page
32
- samao.base_url 'https://github.com'
32
+ samao.baseurl 'https://github.com'
33
33
  samao.from '/Lax?tab=repositories'
34
34
  # the following line has the same effect
35
35
  #samao.from 'https://github.com/Lax?tab=repositories'
36
36
 
37
37
  # tell samao how to find the next page
38
- samao.match :next, 'div.pagination a.next_page'
38
+ samao.find :next, 'div.pagination a.next_page'
39
+ samao.max_page 1
39
40
 
40
41
  # tell samao how to find items.
41
42
  # furthermore, set the data from the matched HTML node/element.
42
- samao.add_item 'div#user-repositories-list li a[itemprop="name codeRepository"]' do |item|
43
+ samao.find_item 'div#user-repositories-list li a[itemprop="name codeRepository"]' do |item|
43
44
  item.set_url :url, item.raw(:item)['href']
44
45
  item.set :title, item.raw(:item).text.strip
45
46
  end
46
47
 
48
+ samao.find_item 'div#user-repositories-list li' do |item|
49
+ item.find(:url, 'a[itemprop="name codeRepository"]') {|value| [:set_url, :url, value.first['href']] }
50
+ item.find(:title, 'a[itemprop="name codeRepository"]') {|value| [:set, value.first.text.strip] }
51
+ end
52
+
47
53
  # if it needs to open the content page for more information
48
54
  # default key is :url
49
55
  samao.add_detail :url do |detail|
50
56
  #samao.add_detail do |detail|
51
- detail.match :author, 'h1.public .author a' do |item|
52
- item.set :author, item.raw(:author).first.text.strip
53
- end
57
+ detail.find(:author, 'h1.public .author a') {|value| value.first.text.strip }
54
58
  end
55
59
 
56
60
  # run the detector
data/lib/samao/catcher.rb CHANGED
@@ -10,7 +10,8 @@ module Samao
10
10
  @code = 0
11
11
 
12
12
  @headers = {}
13
- @headers["Referer"] = params[:base_url] if params[:base_url]
13
+ @headers["Referer"] = params[:baseurl].to_s if params[:baseurl]
14
+ @headers["User-Agent"] = "Samao/%s; Ruby/%s" % [Samao::VERSION, RUBY_VERSION]
14
15
 
15
16
  self
16
17
  end
data/lib/samao/detail.rb CHANGED
@@ -7,8 +7,8 @@ module Samao
7
7
 
8
8
  @item = params[:item]
9
9
  @url = params[:url]
10
- @base_url = params[:base_url]
11
- @catcher = Catcher.new(url:@url, base_url: @base_url)
10
+ @baseurl = params[:baseurl]
11
+ @catcher = Catcher.new(url:@url, baseurl: @baseurl)
12
12
 
13
13
  yield self if block_given?
14
14
 
@@ -18,8 +18,7 @@ module Samao
18
18
  def run
19
19
  if @catcher and @catcher.run.success? and doc = @catcher.doc
20
20
  @selector.each do |name, sel|
21
- @item.set_raw name, doc.css(sel)
22
- @on[name].call @item if @on[name]
21
+ found(name, doc.css(sel), @item)
23
22
  end
24
23
  end
25
24
 
@@ -5,10 +5,16 @@ module Samao
5
5
  def initialize(params={})
6
6
  matchable
7
7
 
8
- @current_url = @base_url = @from = nil
8
+ @current_url = @baseurl = @from = @max_page = nil
9
9
  @pages = []
10
10
  @items = []
11
11
 
12
+ @queue_of_items = Queue.new
13
+
14
+ @semaphore = Queue.new
15
+ @max_concurrent = params[:max_concurrent] || 5
16
+ @max_concurrent.times { @semaphore.push(1) }
17
+
12
18
  yield self if block_given?
13
19
 
14
20
  self
@@ -16,31 +22,55 @@ module Samao
16
22
 
17
23
  # return Detector self
18
24
  def run
19
- while @from and @from.run.success? and @current_doc = @from.doc
25
+ threads = []
26
+ while @from
27
+ break unless @from.run.success?
28
+ @current_doc = @from.doc
29
+
20
30
  # find items in current_page
21
31
  if found = @current_doc.css(@selector[:item]) and found.size >= 1
22
- @items += found.map do |raw_item|
23
- item = Item.new(base_url: @current_url, raw_item:raw_item) do |item|
24
- @on[:item].call(item) if @on[:item]
25
- end.run
26
-
27
- if @detail_key
28
- detail = Detail.new(item: item, url: item.prop(@detail_key)) do |detail|
29
- @on[:detail].call(detail) if @on[:detail]
30
- end.run
31
- end
32
-
33
- item.prop
34
- end
35
- end
32
+ found.each do |raw_item|
33
+ threads << Thread.new do
34
+ @semaphore.pop
35
+ # puts "#{Time.now} #{@semaphore.size} available tokens. #{@semaphore.num_waiting} threads waiting."
36
+
37
+ begin
38
+ item = Item.new(baseurl: @current_url, raw_item:raw_item) do |item|
39
+ @on[:item].call(item) if @on[:item]
40
+ end.run
41
+
42
+ if @detail_key
43
+ detail = Detail.new(item: item, url: item.prop(@detail_key)) do |detail|
44
+ @on[:detail].call(detail) if @on[:detail]
45
+ end.run
46
+ end
47
+
48
+ @queue_of_items.push item.prop
49
+ rescue => e
50
+ p e
51
+ ensure
52
+ @semaphore.push(1)
53
+ end
54
+ end # end Thread
55
+ end # end found.each loop
56
+ end # end if found
36
57
 
37
58
  # find next page[s] in current page
38
- if @selector[:next] and next_url = @current_doc.at_css(@selector[:next]) and next_url = URI.join(@current_url, next_url['href'])
59
+ if @max_page and @pages.size >= @max_page
60
+ stop
61
+ elsif @selector[:next] and next_url = @current_doc.at_css(@selector[:next]) and next_url = URI.join(@current_url, next_url['href'])
39
62
  @on[:next].call(next_url) if @on[:next]
40
63
  from next_url
41
64
  else
42
65
  stop
43
66
  end
67
+ end # end while @from
68
+
69
+ threads.each(&:join)
70
+
71
+ threads.size.times do
72
+ item = @queue_of_items.pop
73
+ @items << item
44
74
  end
45
75
 
46
76
  self
@@ -51,18 +81,18 @@ module Samao
51
81
  @on[:detail] = block if block
52
82
  end
53
83
 
54
- def add_item(selector, &block)
55
- match(:item, selector, &block)
84
+ def find_item(selector, &block)
85
+ find(:item, selector, &block)
56
86
  end
57
87
 
58
88
  # set front page
59
89
  def from(url)
60
- if prev_url = @current_url || @base_url
90
+ if prev_url = @current_url || @baseurl
61
91
  url = URI.join(prev_url, url)
62
92
  end
63
93
  url = URI(url) if ! url.is_a? URI
64
94
 
65
- @from = Catcher.new(url:url, base_url:@current_url)
95
+ @from = Catcher.new(url:url, baseurl:@current_url)
66
96
  @pages << url
67
97
  @current_url = url
68
98
 
@@ -70,8 +100,22 @@ module Samao
70
100
  end
71
101
 
72
102
  # set base url
73
- def base_url(url)
74
- @base_url = url
103
+ def baseurl(url)
104
+ @baseurl = url
105
+
106
+ self
107
+ end
108
+
109
+ # set max page
110
+ def max_page(max)
111
+ @max_page = max
112
+
113
+ self
114
+ end
115
+
116
+ # set max page
117
+ def max_page(max)
118
+ @max_page = max
75
119
 
76
120
  self
77
121
  end
data/lib/samao/item.rb CHANGED
@@ -8,7 +8,8 @@ module Samao
8
8
  @prop = {} # usefull properties
9
9
  @raw = {} # nodes go here.
10
10
 
11
- @base_url = params[:base_url]
11
+ @baseurl = params[:baseurl].to_s
12
+
12
13
  set_raw :item, params[:raw_item] if params[:raw_item]
13
14
 
14
15
  yield self if block_given?
@@ -18,8 +19,7 @@ module Samao
18
19
 
19
20
  def extract
20
21
  @selector.each do |name, sel|
21
- set_raw name, @raw[:item].css(sel)
22
- @on[name].call self if @on[name]
22
+ found(name, @raw[:item].css(sel))
23
23
  end
24
24
 
25
25
  self
@@ -31,10 +31,14 @@ module Samao
31
31
  end
32
32
 
33
33
  def set_url(name, value)
34
- value = URI.join @base_url, value if @base_url
34
+ value = URI.join @baseurl, value if @baseurl
35
35
  set(name, value.to_s)
36
36
  end
37
37
 
38
+ def set_baseurl(baseurl=nil)
39
+ @prop[:baseurl] = baseurl || @baseurl
40
+ end
41
+
38
42
  def prop(name=nil)
39
43
  if name
40
44
  return @prop[name]
@@ -5,17 +5,38 @@ module Samao
5
5
  end
6
6
 
7
7
  def matchable
8
+ @cmd_sets = [:set, :set_url] ## target class should inplements methods as: set(name, value)
8
9
  @selector = {}
9
10
  @on = {}
10
11
  end
11
12
 
12
- def match(name, selector, &block)
13
+ def find(name, selector, &block)
13
14
  @selector[name] = selector
14
15
 
15
16
  @on[name] = block if block
16
17
 
17
18
  self
18
19
  end
20
+ alias match find
21
+
22
+ def found(name, value, target=self)
23
+ cmd = :set
24
+
25
+ if @on[name]
26
+ value = @on[name].call value
27
+ if value.is_a? Array and @cmd_sets.include?(value[0].to_sym)
28
+ case value.length
29
+ when 2
30
+ cmd, value = value
31
+ when 3
32
+ cmd, name, value = value
33
+ end
34
+ end
35
+ target.send cmd, name, value
36
+ end
37
+
38
+ target.send cmd, name, value
39
+ end
19
40
 
20
41
  end
21
42
  end
data/lib/samao/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Samao
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: samao
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Liu Lantao
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-12-18 00:00:00.000000000 Z
11
+ date: 2016-12-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri