samao 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/README.md +10 -6
- data/lib/samao/catcher.rb +2 -1
- data/lib/samao/detail.rb +3 -4
- data/lib/samao/detector.rb +67 -23
- data/lib/samao/item.rb +8 -4
- data/lib/samao/matchable.rb +22 -1
- data/lib/samao/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a483278b39eb8b7c7f68d9ccd02b9d8fa7bb5cec
|
4
|
+
data.tar.gz: 307181057f12f81fca2a408a85dfa9de6fdedeaf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e4608bbab36feb41b82f5779a7c2be74c45349b349b2243cbe30ccf4666c83a95e1407b7439a36c9f4ce0fb3eabc5bd0d2a9cca5b0c6529bde85355ae84afea0
|
7
|
+
data.tar.gz: f181681e70dd14f9607f9a8002e2656522b6f4cb483aa3aa18153f6c2d5bc74619436997e3e1eac5a7d2e3558dfb66fae355bd2c3e73b80a6f39cb40ef8b38c7
|
data/README.md
CHANGED
@@ -29,28 +29,32 @@ Or install it yourself as:
|
|
29
29
|
samao = Samao::Detector.new
|
30
30
|
|
31
31
|
# set base url and start page
|
32
|
-
samao.
|
32
|
+
samao.baseurl 'https://github.com'
|
33
33
|
samao.from '/Lax?tab=repositories'
|
34
34
|
# the following line has the same effect
|
35
35
|
#samao.from 'https://github.com/Lax?tab=repositories'
|
36
36
|
|
37
37
|
# tell samao how to find the next page
|
38
|
-
samao.
|
38
|
+
samao.find :next, 'div.pagination a.next_page'
|
39
|
+
samao.max_page 1
|
39
40
|
|
40
41
|
# tell samao how to find items.
|
41
42
|
# furthermore, set the data from the matched HTML node/element.
|
42
|
-
samao.
|
43
|
+
samao.find_item 'div#user-repositories-list li a[itemprop="name codeRepository"]' do |item|
|
43
44
|
item.set_url :url, item.raw(:item)['href']
|
44
45
|
item.set :title, item.raw(:item).text.strip
|
45
46
|
end
|
46
47
|
|
48
|
+
samao.find_item 'div#user-repositories-list li' do |item|
|
49
|
+
item.find(:url, 'a[itemprop="name codeRepository"]') {|value| [:set_url, :url, value.first['href']] }
|
50
|
+
item.find(:title, 'a[itemprop="name codeRepository"]') {|value| [:set, value.first.text.strip] }
|
51
|
+
end
|
52
|
+
|
47
53
|
# if it needs to open the content page for more information
|
48
54
|
# default key is :url
|
49
55
|
samao.add_detail :url do |detail|
|
50
56
|
#samao.add_detail do |detail|
|
51
|
-
detail.
|
52
|
-
item.set :author, item.raw(:author).first.text.strip
|
53
|
-
end
|
57
|
+
detail.find(:author, 'h1.public .author a') {|value| value.first.text.strip }
|
54
58
|
end
|
55
59
|
|
56
60
|
# run the detector
|
data/lib/samao/catcher.rb
CHANGED
@@ -10,7 +10,8 @@ module Samao
|
|
10
10
|
@code = 0
|
11
11
|
|
12
12
|
@headers = {}
|
13
|
-
@headers["Referer"] = params[:
|
13
|
+
@headers["Referer"] = params[:baseurl].to_s if params[:baseurl]
|
14
|
+
@headers["User-Agent"] = "Samao/%s; Ruby/%s" % [Samao::VERSION, RUBY_VERSION]
|
14
15
|
|
15
16
|
self
|
16
17
|
end
|
data/lib/samao/detail.rb
CHANGED
@@ -7,8 +7,8 @@ module Samao
|
|
7
7
|
|
8
8
|
@item = params[:item]
|
9
9
|
@url = params[:url]
|
10
|
-
@
|
11
|
-
@catcher = Catcher.new(url:@url,
|
10
|
+
@baseurl = params[:baseurl]
|
11
|
+
@catcher = Catcher.new(url:@url, baseurl: @baseurl)
|
12
12
|
|
13
13
|
yield self if block_given?
|
14
14
|
|
@@ -18,8 +18,7 @@ module Samao
|
|
18
18
|
def run
|
19
19
|
if @catcher and @catcher.run.success? and doc = @catcher.doc
|
20
20
|
@selector.each do |name, sel|
|
21
|
-
|
22
|
-
@on[name].call @item if @on[name]
|
21
|
+
found(name, doc.css(sel), @item)
|
23
22
|
end
|
24
23
|
end
|
25
24
|
|
data/lib/samao/detector.rb
CHANGED
@@ -5,10 +5,16 @@ module Samao
|
|
5
5
|
def initialize(params={})
|
6
6
|
matchable
|
7
7
|
|
8
|
-
@current_url = @
|
8
|
+
@current_url = @baseurl = @from = @max_page = nil
|
9
9
|
@pages = []
|
10
10
|
@items = []
|
11
11
|
|
12
|
+
@queue_of_items = Queue.new
|
13
|
+
|
14
|
+
@semaphore = Queue.new
|
15
|
+
@max_concurrent = params[:max_concurrent] || 5
|
16
|
+
@max_concurrent.times { @semaphore.push(1) }
|
17
|
+
|
12
18
|
yield self if block_given?
|
13
19
|
|
14
20
|
self
|
@@ -16,31 +22,55 @@ module Samao
|
|
16
22
|
|
17
23
|
# return Detector self
|
18
24
|
def run
|
19
|
-
|
25
|
+
threads = []
|
26
|
+
while @from
|
27
|
+
break unless @from.run.success?
|
28
|
+
@current_doc = @from.doc
|
29
|
+
|
20
30
|
# find items in current_page
|
21
31
|
if found = @current_doc.css(@selector[:item]) and found.size >= 1
|
22
|
-
|
23
|
-
|
24
|
-
@
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
32
|
+
found.each do |raw_item|
|
33
|
+
threads << Thread.new do
|
34
|
+
@semaphore.pop
|
35
|
+
# puts "#{Time.now} #{@semaphore.size} available tokens. #{@semaphore.num_waiting} threads waiting."
|
36
|
+
|
37
|
+
begin
|
38
|
+
item = Item.new(baseurl: @current_url, raw_item:raw_item) do |item|
|
39
|
+
@on[:item].call(item) if @on[:item]
|
40
|
+
end.run
|
41
|
+
|
42
|
+
if @detail_key
|
43
|
+
detail = Detail.new(item: item, url: item.prop(@detail_key)) do |detail|
|
44
|
+
@on[:detail].call(detail) if @on[:detail]
|
45
|
+
end.run
|
46
|
+
end
|
47
|
+
|
48
|
+
@queue_of_items.push item.prop
|
49
|
+
rescue => e
|
50
|
+
p e
|
51
|
+
ensure
|
52
|
+
@semaphore.push(1)
|
53
|
+
end
|
54
|
+
end # end Thread
|
55
|
+
end # end found.each loop
|
56
|
+
end # end if found
|
36
57
|
|
37
58
|
# find next page[s] in current page
|
38
|
-
if @
|
59
|
+
if @max_page and @pages.size >= @max_page
|
60
|
+
stop
|
61
|
+
elsif @selector[:next] and next_url = @current_doc.at_css(@selector[:next]) and next_url = URI.join(@current_url, next_url['href'])
|
39
62
|
@on[:next].call(next_url) if @on[:next]
|
40
63
|
from next_url
|
41
64
|
else
|
42
65
|
stop
|
43
66
|
end
|
67
|
+
end # end while @from
|
68
|
+
|
69
|
+
threads.each(&:join)
|
70
|
+
|
71
|
+
threads.size.times do
|
72
|
+
item = @queue_of_items.pop
|
73
|
+
@items << item
|
44
74
|
end
|
45
75
|
|
46
76
|
self
|
@@ -51,18 +81,18 @@ module Samao
|
|
51
81
|
@on[:detail] = block if block
|
52
82
|
end
|
53
83
|
|
54
|
-
def
|
55
|
-
|
84
|
+
def find_item(selector, &block)
|
85
|
+
find(:item, selector, &block)
|
56
86
|
end
|
57
87
|
|
58
88
|
# set front page
|
59
89
|
def from(url)
|
60
|
-
if prev_url = @current_url || @
|
90
|
+
if prev_url = @current_url || @baseurl
|
61
91
|
url = URI.join(prev_url, url)
|
62
92
|
end
|
63
93
|
url = URI(url) if ! url.is_a? URI
|
64
94
|
|
65
|
-
@from = Catcher.new(url:url,
|
95
|
+
@from = Catcher.new(url:url, baseurl:@current_url)
|
66
96
|
@pages << url
|
67
97
|
@current_url = url
|
68
98
|
|
@@ -70,8 +100,22 @@ module Samao
|
|
70
100
|
end
|
71
101
|
|
72
102
|
# set base url
|
73
|
-
def
|
74
|
-
@
|
103
|
+
def baseurl(url)
|
104
|
+
@baseurl = url
|
105
|
+
|
106
|
+
self
|
107
|
+
end
|
108
|
+
|
109
|
+
# set max page
|
110
|
+
def max_page(max)
|
111
|
+
@max_page = max
|
112
|
+
|
113
|
+
self
|
114
|
+
end
|
115
|
+
|
116
|
+
# set max page
|
117
|
+
def max_page(max)
|
118
|
+
@max_page = max
|
75
119
|
|
76
120
|
self
|
77
121
|
end
|
data/lib/samao/item.rb
CHANGED
@@ -8,7 +8,8 @@ module Samao
|
|
8
8
|
@prop = {} # usefull properties
|
9
9
|
@raw = {} # nodes go here.
|
10
10
|
|
11
|
-
@
|
11
|
+
@baseurl = params[:baseurl].to_s
|
12
|
+
|
12
13
|
set_raw :item, params[:raw_item] if params[:raw_item]
|
13
14
|
|
14
15
|
yield self if block_given?
|
@@ -18,8 +19,7 @@ module Samao
|
|
18
19
|
|
19
20
|
def extract
|
20
21
|
@selector.each do |name, sel|
|
21
|
-
|
22
|
-
@on[name].call self if @on[name]
|
22
|
+
found(name, @raw[:item].css(sel))
|
23
23
|
end
|
24
24
|
|
25
25
|
self
|
@@ -31,10 +31,14 @@ module Samao
|
|
31
31
|
end
|
32
32
|
|
33
33
|
def set_url(name, value)
|
34
|
-
value = URI.join @
|
34
|
+
value = URI.join @baseurl, value if @baseurl
|
35
35
|
set(name, value.to_s)
|
36
36
|
end
|
37
37
|
|
38
|
+
def set_baseurl(baseurl=nil)
|
39
|
+
@prop[:baseurl] = baseurl || @baseurl
|
40
|
+
end
|
41
|
+
|
38
42
|
def prop(name=nil)
|
39
43
|
if name
|
40
44
|
return @prop[name]
|
data/lib/samao/matchable.rb
CHANGED
@@ -5,17 +5,38 @@ module Samao
|
|
5
5
|
end
|
6
6
|
|
7
7
|
def matchable
|
8
|
+
@cmd_sets = [:set, :set_url] ## target class should inplements methods as: set(name, value)
|
8
9
|
@selector = {}
|
9
10
|
@on = {}
|
10
11
|
end
|
11
12
|
|
12
|
-
def
|
13
|
+
def find(name, selector, &block)
|
13
14
|
@selector[name] = selector
|
14
15
|
|
15
16
|
@on[name] = block if block
|
16
17
|
|
17
18
|
self
|
18
19
|
end
|
20
|
+
alias match find
|
21
|
+
|
22
|
+
def found(name, value, target=self)
|
23
|
+
cmd = :set
|
24
|
+
|
25
|
+
if @on[name]
|
26
|
+
value = @on[name].call value
|
27
|
+
if value.is_a? Array and @cmd_sets.include?(value[0].to_sym)
|
28
|
+
case value.length
|
29
|
+
when 2
|
30
|
+
cmd, value = value
|
31
|
+
when 3
|
32
|
+
cmd, name, value = value
|
33
|
+
end
|
34
|
+
end
|
35
|
+
target.send cmd, name, value
|
36
|
+
end
|
37
|
+
|
38
|
+
target.send cmd, name, value
|
39
|
+
end
|
19
40
|
|
20
41
|
end
|
21
42
|
end
|
data/lib/samao/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: samao
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Liu Lantao
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-12-
|
11
|
+
date: 2016-12-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|