samao 0.1.0 → 0.1.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/README.md +10 -6
- data/lib/samao/catcher.rb +2 -1
- data/lib/samao/detail.rb +3 -4
- data/lib/samao/detector.rb +67 -23
- data/lib/samao/item.rb +8 -4
- data/lib/samao/matchable.rb +22 -1
- data/lib/samao/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA1:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a483278b39eb8b7c7f68d9ccd02b9d8fa7bb5cec
|
4
|
+
data.tar.gz: 307181057f12f81fca2a408a85dfa9de6fdedeaf
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: e4608bbab36feb41b82f5779a7c2be74c45349b349b2243cbe30ccf4666c83a95e1407b7439a36c9f4ce0fb3eabc5bd0d2a9cca5b0c6529bde85355ae84afea0
|
7
|
+
data.tar.gz: f181681e70dd14f9607f9a8002e2656522b6f4cb483aa3aa18153f6c2d5bc74619436997e3e1eac5a7d2e3558dfb66fae355bd2c3e73b80a6f39cb40ef8b38c7
|
data/README.md
CHANGED
@@ -29,28 +29,32 @@ Or install it yourself as:
|
|
29
29
|
samao = Samao::Detector.new
|
30
30
|
|
31
31
|
# set base url and start page
|
32
|
-
samao.
|
32
|
+
samao.baseurl 'https://github.com'
|
33
33
|
samao.from '/Lax?tab=repositories'
|
34
34
|
# the following line has the same effect
|
35
35
|
#samao.from 'https://github.com/Lax?tab=repositories'
|
36
36
|
|
37
37
|
# tell samao how to find the next page
|
38
|
-
samao.
|
38
|
+
samao.find :next, 'div.pagination a.next_page'
|
39
|
+
samao.max_page 1
|
39
40
|
|
40
41
|
# tell samao how to find items.
|
41
42
|
# furthermore, set the data from the matched HTML node/element.
|
42
|
-
samao.
|
43
|
+
samao.find_item 'div#user-repositories-list li a[itemprop="name codeRepository"]' do |item|
|
43
44
|
item.set_url :url, item.raw(:item)['href']
|
44
45
|
item.set :title, item.raw(:item).text.strip
|
45
46
|
end
|
46
47
|
|
48
|
+
samao.find_item 'div#user-repositories-list li' do |item|
|
49
|
+
item.find(:url, 'a[itemprop="name codeRepository"]') {|value| [:set_url, :url, value.first['href']] }
|
50
|
+
item.find(:title, 'a[itemprop="name codeRepository"]') {|value| [:set, value.first.text.strip] }
|
51
|
+
end
|
52
|
+
|
47
53
|
# if it needs to open the content page for more information
|
48
54
|
# default key is :url
|
49
55
|
samao.add_detail :url do |detail|
|
50
56
|
#samao.add_detail do |detail|
|
51
|
-
detail.
|
52
|
-
item.set :author, item.raw(:author).first.text.strip
|
53
|
-
end
|
57
|
+
detail.find(:author, 'h1.public .author a') {|value| value.first.text.strip }
|
54
58
|
end
|
55
59
|
|
56
60
|
# run the detector
|
data/lib/samao/catcher.rb
CHANGED
@@ -10,7 +10,8 @@ module Samao
|
|
10
10
|
@code = 0
|
11
11
|
|
12
12
|
@headers = {}
|
13
|
-
@headers["Referer"] = params[:
|
13
|
+
@headers["Referer"] = params[:baseurl].to_s if params[:baseurl]
|
14
|
+
@headers["User-Agent"] = "Samao/%s; Ruby/%s" % [Samao::VERSION, RUBY_VERSION]
|
14
15
|
|
15
16
|
self
|
16
17
|
end
|
data/lib/samao/detail.rb
CHANGED
@@ -7,8 +7,8 @@ module Samao
|
|
7
7
|
|
8
8
|
@item = params[:item]
|
9
9
|
@url = params[:url]
|
10
|
-
@
|
11
|
-
@catcher = Catcher.new(url:@url,
|
10
|
+
@baseurl = params[:baseurl]
|
11
|
+
@catcher = Catcher.new(url:@url, baseurl: @baseurl)
|
12
12
|
|
13
13
|
yield self if block_given?
|
14
14
|
|
@@ -18,8 +18,7 @@ module Samao
|
|
18
18
|
def run
|
19
19
|
if @catcher and @catcher.run.success? and doc = @catcher.doc
|
20
20
|
@selector.each do |name, sel|
|
21
|
-
|
22
|
-
@on[name].call @item if @on[name]
|
21
|
+
found(name, doc.css(sel), @item)
|
23
22
|
end
|
24
23
|
end
|
25
24
|
|
data/lib/samao/detector.rb
CHANGED
@@ -5,10 +5,16 @@ module Samao
|
|
5
5
|
def initialize(params={})
|
6
6
|
matchable
|
7
7
|
|
8
|
-
@current_url = @
|
8
|
+
@current_url = @baseurl = @from = @max_page = nil
|
9
9
|
@pages = []
|
10
10
|
@items = []
|
11
11
|
|
12
|
+
@queue_of_items = Queue.new
|
13
|
+
|
14
|
+
@semaphore = Queue.new
|
15
|
+
@max_concurrent = params[:max_concurrent] || 5
|
16
|
+
@max_concurrent.times { @semaphore.push(1) }
|
17
|
+
|
12
18
|
yield self if block_given?
|
13
19
|
|
14
20
|
self
|
@@ -16,31 +22,55 @@ module Samao
|
|
16
22
|
|
17
23
|
# return Detector self
|
18
24
|
def run
|
19
|
-
|
25
|
+
threads = []
|
26
|
+
while @from
|
27
|
+
break unless @from.run.success?
|
28
|
+
@current_doc = @from.doc
|
29
|
+
|
20
30
|
# find items in current_page
|
21
31
|
if found = @current_doc.css(@selector[:item]) and found.size >= 1
|
22
|
-
|
23
|
-
|
24
|
-
@
|
25
|
-
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
32
|
-
|
33
|
-
|
34
|
-
|
35
|
-
|
32
|
+
found.each do |raw_item|
|
33
|
+
threads << Thread.new do
|
34
|
+
@semaphore.pop
|
35
|
+
# puts "#{Time.now} #{@semaphore.size} available tokens. #{@semaphore.num_waiting} threads waiting."
|
36
|
+
|
37
|
+
begin
|
38
|
+
item = Item.new(baseurl: @current_url, raw_item:raw_item) do |item|
|
39
|
+
@on[:item].call(item) if @on[:item]
|
40
|
+
end.run
|
41
|
+
|
42
|
+
if @detail_key
|
43
|
+
detail = Detail.new(item: item, url: item.prop(@detail_key)) do |detail|
|
44
|
+
@on[:detail].call(detail) if @on[:detail]
|
45
|
+
end.run
|
46
|
+
end
|
47
|
+
|
48
|
+
@queue_of_items.push item.prop
|
49
|
+
rescue => e
|
50
|
+
p e
|
51
|
+
ensure
|
52
|
+
@semaphore.push(1)
|
53
|
+
end
|
54
|
+
end # end Thread
|
55
|
+
end # end found.each loop
|
56
|
+
end # end if found
|
36
57
|
|
37
58
|
# find next page[s] in current page
|
38
|
-
if @
|
59
|
+
if @max_page and @pages.size >= @max_page
|
60
|
+
stop
|
61
|
+
elsif @selector[:next] and next_url = @current_doc.at_css(@selector[:next]) and next_url = URI.join(@current_url, next_url['href'])
|
39
62
|
@on[:next].call(next_url) if @on[:next]
|
40
63
|
from next_url
|
41
64
|
else
|
42
65
|
stop
|
43
66
|
end
|
67
|
+
end # end while @from
|
68
|
+
|
69
|
+
threads.each(&:join)
|
70
|
+
|
71
|
+
threads.size.times do
|
72
|
+
item = @queue_of_items.pop
|
73
|
+
@items << item
|
44
74
|
end
|
45
75
|
|
46
76
|
self
|
@@ -51,18 +81,18 @@ module Samao
|
|
51
81
|
@on[:detail] = block if block
|
52
82
|
end
|
53
83
|
|
54
|
-
def
|
55
|
-
|
84
|
+
def find_item(selector, &block)
|
85
|
+
find(:item, selector, &block)
|
56
86
|
end
|
57
87
|
|
58
88
|
# set front page
|
59
89
|
def from(url)
|
60
|
-
if prev_url = @current_url || @
|
90
|
+
if prev_url = @current_url || @baseurl
|
61
91
|
url = URI.join(prev_url, url)
|
62
92
|
end
|
63
93
|
url = URI(url) if ! url.is_a? URI
|
64
94
|
|
65
|
-
@from = Catcher.new(url:url,
|
95
|
+
@from = Catcher.new(url:url, baseurl:@current_url)
|
66
96
|
@pages << url
|
67
97
|
@current_url = url
|
68
98
|
|
@@ -70,8 +100,22 @@ module Samao
|
|
70
100
|
end
|
71
101
|
|
72
102
|
# set base url
|
73
|
-
def
|
74
|
-
@
|
103
|
+
def baseurl(url)
|
104
|
+
@baseurl = url
|
105
|
+
|
106
|
+
self
|
107
|
+
end
|
108
|
+
|
109
|
+
# set max page
|
110
|
+
def max_page(max)
|
111
|
+
@max_page = max
|
112
|
+
|
113
|
+
self
|
114
|
+
end
|
115
|
+
|
116
|
+
# set max page
|
117
|
+
def max_page(max)
|
118
|
+
@max_page = max
|
75
119
|
|
76
120
|
self
|
77
121
|
end
|
data/lib/samao/item.rb
CHANGED
@@ -8,7 +8,8 @@ module Samao
|
|
8
8
|
@prop = {} # useful properties
|
9
9
|
@raw = {} # nodes go here.
|
10
10
|
|
11
|
-
@
|
11
|
+
@baseurl = params[:baseurl].to_s
|
12
|
+
|
12
13
|
set_raw :item, params[:raw_item] if params[:raw_item]
|
13
14
|
|
14
15
|
yield self if block_given?
|
@@ -18,8 +19,7 @@ module Samao
|
|
18
19
|
|
19
20
|
def extract
|
20
21
|
@selector.each do |name, sel|
|
21
|
-
|
22
|
-
@on[name].call self if @on[name]
|
22
|
+
found(name, @raw[:item].css(sel))
|
23
23
|
end
|
24
24
|
|
25
25
|
self
|
@@ -31,10 +31,14 @@ module Samao
|
|
31
31
|
end
|
32
32
|
|
33
33
|
def set_url(name, value)
|
34
|
-
value = URI.join @
|
34
|
+
value = URI.join @baseurl, value if @baseurl
|
35
35
|
set(name, value.to_s)
|
36
36
|
end
|
37
37
|
|
38
|
+
def set_baseurl(baseurl=nil)
|
39
|
+
@prop[:baseurl] = baseurl || @baseurl
|
40
|
+
end
|
41
|
+
|
38
42
|
def prop(name=nil)
|
39
43
|
if name
|
40
44
|
return @prop[name]
|
data/lib/samao/matchable.rb
CHANGED
@@ -5,17 +5,38 @@ module Samao
|
|
5
5
|
end
|
6
6
|
|
7
7
|
def matchable
|
8
|
+
@cmd_sets = [:set, :set_url] ## target class should implement methods such as: set(name, value)
|
8
9
|
@selector = {}
|
9
10
|
@on = {}
|
10
11
|
end
|
11
12
|
|
12
|
-
def
|
13
|
+
def find(name, selector, &block)
|
13
14
|
@selector[name] = selector
|
14
15
|
|
15
16
|
@on[name] = block if block
|
16
17
|
|
17
18
|
self
|
18
19
|
end
|
20
|
+
alias match find
|
21
|
+
|
22
|
+
def found(name, value, target=self)
|
23
|
+
cmd = :set
|
24
|
+
|
25
|
+
if @on[name]
|
26
|
+
value = @on[name].call value
|
27
|
+
if value.is_a? Array and @cmd_sets.include?(value[0].to_sym)
|
28
|
+
case value.length
|
29
|
+
when 2
|
30
|
+
cmd, value = value
|
31
|
+
when 3
|
32
|
+
cmd, name, value = value
|
33
|
+
end
|
34
|
+
end
|
35
|
+
target.send cmd, name, value
|
36
|
+
end
|
37
|
+
|
38
|
+
target.send cmd, name, value
|
39
|
+
end
|
19
40
|
|
20
41
|
end
|
21
42
|
end
|
data/lib/samao/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: samao
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.
|
4
|
+
version: 0.1.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Liu Lantao
|
8
8
|
autorequire:
|
9
9
|
bindir: exe
|
10
10
|
cert_chain: []
|
11
|
-
date: 2016-12-
|
11
|
+
date: 2016-12-20 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: nokogiri
|