samao 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA1:
3
- metadata.gz: 561ab598e5442161a48b2878887296f8047aa269
4
- data.tar.gz: 76d3daa2ca3c1407286690404809ee2b58115764
3
+ metadata.gz: a483278b39eb8b7c7f68d9ccd02b9d8fa7bb5cec
4
+ data.tar.gz: 307181057f12f81fca2a408a85dfa9de6fdedeaf
5
5
  SHA512:
6
- metadata.gz: acab6baf781e25d75cb34d7214ac78cdb79936cca735fa554f152342fd285ed562e30c2ac5983a0c21cd6b3154dacd1830871637fd08be89dfbde109569591ca
7
- data.tar.gz: 5a2f6e9d1a9ee336fe2e4234ec8d6336f00748cb5d39a06932987a557ec342c7f223471700502a64d3b02296920e91f3c2484078b23432b189ec2b09fb529abc
6
+ metadata.gz: e4608bbab36feb41b82f5779a7c2be74c45349b349b2243cbe30ccf4666c83a95e1407b7439a36c9f4ce0fb3eabc5bd0d2a9cca5b0c6529bde85355ae84afea0
7
+ data.tar.gz: f181681e70dd14f9607f9a8002e2656522b6f4cb483aa3aa18153f6c2d5bc74619436997e3e1eac5a7d2e3558dfb66fae355bd2c3e73b80a6f39cb40ef8b38c7
data/README.md CHANGED
@@ -29,28 +29,32 @@ Or install it yourself as:
29
29
  samao = Samao::Detector.new
30
30
 
31
31
  # set base url and start page
32
- samao.base_url 'https://github.com'
32
+ samao.baseurl 'https://github.com'
33
33
  samao.from '/Lax?tab=repositories'
34
34
  # the following line has the same effect
35
35
  #samao.from 'https://github.com/Lax?tab=repositories'
36
36
 
37
37
  # tell samao how to find the next page
38
- samao.match :next, 'div.pagination a.next_page'
38
+ samao.find :next, 'div.pagination a.next_page'
39
+ samao.max_page 1
39
40
 
40
41
  # tell samao how to find items.
41
42
  # furthermore, set the data from the matched HTML node/element.
42
- samao.add_item 'div#user-repositories-list li a[itemprop="name codeRepository"]' do |item|
43
+ samao.find_item 'div#user-repositories-list li a[itemprop="name codeRepository"]' do |item|
43
44
  item.set_url :url, item.raw(:item)['href']
44
45
  item.set :title, item.raw(:item).text.strip
45
46
  end
46
47
 
48
+ samao.find_item 'div#user-repositories-list li' do |item|
49
+ item.find(:url, 'a[itemprop="name codeRepository"]') {|value| [:set_url, :url, value.first['href']] }
50
+ item.find(:title, 'a[itemprop="name codeRepository"]') {|value| [:set, value.first.text.strip] }
51
+ end
52
+
47
53
  # if it needs to open the content page for more information
48
54
  # default key is :url
49
55
  samao.add_detail :url do |detail|
50
56
  #samao.add_detail do |detail|
51
- detail.match :author, 'h1.public .author a' do |item|
52
- item.set :author, item.raw(:author).first.text.strip
53
- end
57
+ detail.find(:author, 'h1.public .author a') {|value| value.first.text.strip }
54
58
  end
55
59
 
56
60
  # run the detector
data/lib/samao/catcher.rb CHANGED
@@ -10,7 +10,8 @@ module Samao
10
10
  @code = 0
11
11
 
12
12
  @headers = {}
13
- @headers["Referer"] = params[:base_url] if params[:base_url]
13
+ @headers["Referer"] = params[:baseurl].to_s if params[:baseurl]
14
+ @headers["User-Agent"] = "Samao/%s; Ruby/%s" % [Samao::VERSION, RUBY_VERSION]
14
15
 
15
16
  self
16
17
  end
data/lib/samao/detail.rb CHANGED
@@ -7,8 +7,8 @@ module Samao
7
7
 
8
8
  @item = params[:item]
9
9
  @url = params[:url]
10
- @base_url = params[:base_url]
11
- @catcher = Catcher.new(url:@url, base_url: @base_url)
10
+ @baseurl = params[:baseurl]
11
+ @catcher = Catcher.new(url:@url, baseurl: @baseurl)
12
12
 
13
13
  yield self if block_given?
14
14
 
@@ -18,8 +18,7 @@ module Samao
18
18
  def run
19
19
  if @catcher and @catcher.run.success? and doc = @catcher.doc
20
20
  @selector.each do |name, sel|
21
- @item.set_raw name, doc.css(sel)
22
- @on[name].call @item if @on[name]
21
+ found(name, doc.css(sel), @item)
23
22
  end
24
23
  end
25
24
 
@@ -5,10 +5,16 @@ module Samao
5
5
  def initialize(params={})
6
6
  matchable
7
7
 
8
- @current_url = @base_url = @from = nil
8
+ @current_url = @baseurl = @from = @max_page = nil
9
9
  @pages = []
10
10
  @items = []
11
11
 
12
+ @queue_of_items = Queue.new
13
+
14
+ @semaphore = Queue.new
15
+ @max_concurrent = params[:max_concurrent] || 5
16
+ @max_concurrent.times { @semaphore.push(1) }
17
+
12
18
  yield self if block_given?
13
19
 
14
20
  self
@@ -16,31 +22,55 @@ module Samao
16
22
 
17
23
  # return Detector self
18
24
  def run
19
- while @from and @from.run.success? and @current_doc = @from.doc
25
+ threads = []
26
+ while @from
27
+ break unless @from.run.success?
28
+ @current_doc = @from.doc
29
+
20
30
  # find items in current_page
21
31
  if found = @current_doc.css(@selector[:item]) and found.size >= 1
22
- @items += found.map do |raw_item|
23
- item = Item.new(base_url: @current_url, raw_item:raw_item) do |item|
24
- @on[:item].call(item) if @on[:item]
25
- end.run
26
-
27
- if @detail_key
28
- detail = Detail.new(item: item, url: item.prop(@detail_key)) do |detail|
29
- @on[:detail].call(detail) if @on[:detail]
30
- end.run
31
- end
32
-
33
- item.prop
34
- end
35
- end
32
+ found.each do |raw_item|
33
+ threads << Thread.new do
34
+ @semaphore.pop
35
+ # puts "#{Time.now} #{@semaphore.size} available tokens. #{@semaphore.num_waiting} threads waiting."
36
+
37
+ begin
38
+ item = Item.new(baseurl: @current_url, raw_item:raw_item) do |item|
39
+ @on[:item].call(item) if @on[:item]
40
+ end.run
41
+
42
+ if @detail_key
43
+ detail = Detail.new(item: item, url: item.prop(@detail_key)) do |detail|
44
+ @on[:detail].call(detail) if @on[:detail]
45
+ end.run
46
+ end
47
+
48
+ @queue_of_items.push item.prop
49
+ rescue => e
50
+ p e
51
+ ensure
52
+ @semaphore.push(1)
53
+ end
54
+ end # end Thread
55
+ end # end found.each loop
56
+ end # end if found
36
57
 
37
58
  # find next page[s] in current page
38
- if @selector[:next] and next_url = @current_doc.at_css(@selector[:next]) and next_url = URI.join(@current_url, next_url['href'])
59
+ if @max_page and @pages.size >= @max_page
60
+ stop
61
+ elsif @selector[:next] and next_url = @current_doc.at_css(@selector[:next]) and next_url = URI.join(@current_url, next_url['href'])
39
62
  @on[:next].call(next_url) if @on[:next]
40
63
  from next_url
41
64
  else
42
65
  stop
43
66
  end
67
+ end # end while @from
68
+
69
+ threads.each(&:join)
70
+
71
+ threads.size.times do
72
+ item = @queue_of_items.pop
73
+ @items << item
44
74
  end
45
75
 
46
76
  self
@@ -51,18 +81,18 @@ module Samao
51
81
  @on[:detail] = block if block
52
82
  end
53
83
 
54
- def add_item(selector, &block)
55
- match(:item, selector, &block)
84
+ def find_item(selector, &block)
85
+ find(:item, selector, &block)
56
86
  end
57
87
 
58
88
  # set front page
59
89
  def from(url)
60
- if prev_url = @current_url || @base_url
90
+ if prev_url = @current_url || @baseurl
61
91
  url = URI.join(prev_url, url)
62
92
  end
63
93
  url = URI(url) if ! url.is_a? URI
64
94
 
65
- @from = Catcher.new(url:url, base_url:@current_url)
95
+ @from = Catcher.new(url:url, baseurl:@current_url)
66
96
  @pages << url
67
97
  @current_url = url
68
98
 
@@ -70,8 +100,22 @@ module Samao
70
100
  end
71
101
 
72
102
  # set base url
73
- def base_url(url)
74
- @base_url = url
103
+ def baseurl(url)
104
+ @baseurl = url
105
+
106
+ self
107
+ end
108
+
109
+ # set max page
110
+ def max_page(max)
111
+ @max_page = max
112
+
113
+ self
114
+ end
115
+
116
+ # set max page
117
+ def max_page(max)
118
+ @max_page = max
75
119
 
76
120
  self
77
121
  end
data/lib/samao/item.rb CHANGED
@@ -8,7 +8,8 @@ module Samao
8
8
  @prop = {} # usefull properties
9
9
  @raw = {} # nodes go here.
10
10
 
11
- @base_url = params[:base_url]
11
+ @baseurl = params[:baseurl].to_s
12
+
12
13
  set_raw :item, params[:raw_item] if params[:raw_item]
13
14
 
14
15
  yield self if block_given?
@@ -18,8 +19,7 @@ module Samao
18
19
 
19
20
  def extract
20
21
  @selector.each do |name, sel|
21
- set_raw name, @raw[:item].css(sel)
22
- @on[name].call self if @on[name]
22
+ found(name, @raw[:item].css(sel))
23
23
  end
24
24
 
25
25
  self
@@ -31,10 +31,14 @@ module Samao
31
31
  end
32
32
 
33
33
  def set_url(name, value)
34
- value = URI.join @base_url, value if @base_url
34
+ value = URI.join @baseurl, value if @baseurl
35
35
  set(name, value.to_s)
36
36
  end
37
37
 
38
+ def set_baseurl(baseurl=nil)
39
+ @prop[:baseurl] = baseurl || @baseurl
40
+ end
41
+
38
42
  def prop(name=nil)
39
43
  if name
40
44
  return @prop[name]
@@ -5,17 +5,38 @@ module Samao
5
5
  end
6
6
 
7
7
  def matchable
8
+ @cmd_sets = [:set, :set_url] ## target class should inplements methods as: set(name, value)
8
9
  @selector = {}
9
10
  @on = {}
10
11
  end
11
12
 
12
- def match(name, selector, &block)
13
+ def find(name, selector, &block)
13
14
  @selector[name] = selector
14
15
 
15
16
  @on[name] = block if block
16
17
 
17
18
  self
18
19
  end
20
+ alias match find
21
+
22
+ def found(name, value, target=self)
23
+ cmd = :set
24
+
25
+ if @on[name]
26
+ value = @on[name].call value
27
+ if value.is_a? Array and @cmd_sets.include?(value[0].to_sym)
28
+ case value.length
29
+ when 2
30
+ cmd, value = value
31
+ when 3
32
+ cmd, name, value = value
33
+ end
34
+ end
35
+ target.send cmd, name, value
36
+ end
37
+
38
+ target.send cmd, name, value
39
+ end
19
40
 
20
41
  end
21
42
  end
data/lib/samao/version.rb CHANGED
@@ -1,3 +1,3 @@
1
1
  module Samao
2
- VERSION = "0.1.0"
2
+ VERSION = "0.1.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: samao
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.1.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Liu Lantao
8
8
  autorequire:
9
9
  bindir: exe
10
10
  cert_chain: []
11
- date: 2016-12-18 00:00:00.000000000 Z
11
+ date: 2016-12-20 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: nokogiri